diff --git a/.gitmodules b/.gitmodules
index 0c41450793fc2..3cdbc077ba1ec 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -70,8 +70,8 @@
 	path = third_party/xbyak
 	url = https://github.com/herumi/xbyak.git
 	ignore = dirty
-[submodule "third_party/mkldnn"]
-	path = third_party/mkldnn
+[submodule "third_party/onednn"]
+	path = third_party/onednn
 	url = https://github.com/oneapi-src/oneDNN.git
 	ignore = dirty
 [submodule "third_party/flashattn"]
@@ -118,3 +118,7 @@
 	path = third_party/cryptopp-cmake
 	url = https://github.com/noloader/cryptopp-cmake.git
 	ignore = dirty
+[submodule "third_party/nlohmann_json"]
+	path = third_party/nlohmann_json
+	url = https://github.com/nlohmann/json.git
+	ignore = dirty
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 3d1ac6a170243..f1dcbd658cb35 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -42,7 +42,7 @@ repos:
     hooks:
       - id: copyright_checker
         name: copyright_checker
-        entry: python ./tools/codestyle/copyright.hook
+        entry: python ./tools/codestyle/copyright.py
         language: system
         files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps|py|sh)$
         exclude: |
@@ -57,7 +57,7 @@ repos:
       - id: black
         files: (.*\.(py|pyi|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.3.0
+    rev: v0.3.5
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix, --no-cache]
@@ -67,7 +67,7 @@ repos:
       - id: clang-format
        name: clang-format
        description: Format files with ClangFormat.
-        entry: bash ./tools/codestyle/clang_format.hook -i
+        entry: bash ./tools/codestyle/clang_format.sh -i
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|xpu|kps)$
 - repo: local
@@ -75,7 +75,7 @@ repos:
      - id: cpplint-cpp-source
        name: cpplint
        description: Check C++ code style using cpplint.py.
-        entry: bash ./tools/codestyle/cpplint_pre_commit.hook
+        entry: bash ./tools/codestyle/cpplint_pre_commit.sh
        language: system
        files: \.(cc|cxx|cpp|cu|h|hpp|hxx)$
        args:
diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md
deleted file mode 100644
index 6b2614b101108..0000000000000
--- a/ISSUE_TEMPLATE.md
+++ /dev/null
@@ -1,14 +0,0 @@
-Thank you for contributing to PaddlePaddle. Submitting an issue is a great help for us.
-Both Chinese and English issues are welcome.
-
-It's hard to solve a problem when important details are missing.
-Before submitting the issue, look over the following criteria before handing your request in.
-
-- [ ] Was there a similar issue submitted or resolved before ? You could search issue in the github.
-- [ ] Did you retrieve your issue from widespread search engines ?
-- [ ] Is my description of the issue clear enough to reproduce this problem?
-   * If some errors occurred, we need details about `how do you run your code?`, `what system do you use?`, `Are you using GPU or not?`, etc.
-   * If you use an recording [asciinema](https://asciinema.org/) to show what you are doing to make it happen, that's awesome! We could help you solve the problem more quickly.
-- [ ] Is my description of the issue use the github markdown correctly?
-   * Please use the proper markdown syntaxes for styling all forms of writing, e.g, source code, error information, etc.
-   * Check out [this page](https://guides.github.com/features/mastering-markdown/) to find out much more about markdown.
diff --git a/cmake/cinn.cmake b/cmake/cinn.cmake
index 05210fd578365..a8627b6f70fd0 100644
--- a/cmake/cinn.cmake
+++ b/cmake/cinn.cmake
@@ -59,7 +59,7 @@ if(WITH_MKL)
   add_dependencies(cinn_mklml ${MKLML_PROJECT})
   add_definitions(-DCINN_WITH_MKL_CBLAS)
 endif()
-if(WITH_MKLDNN)
+if(WITH_ONEDNN)
   add_definitions(-DCINN_WITH_DNNL)
 endif()
 
@@ -164,6 +164,8 @@ cinn_cc_library(
   isl
   ginac
   pybind
+  op_fusion
+  cinn_op_dialect
   ${jitify_deps})
 add_dependencies(cinnapi GEN_LLVM_RUNTIME_IR_HEADER ZLIB::ZLIB)
 add_dependencies(cinnapi GEN_LLVM_RUNTIME_IR_HEADER ${core_deps})
@@ -175,9 +177,9 @@ target_link_libraries(cinnapi ${PYTHON_LIBRARIES})
 if(WITH_MKL)
   target_link_libraries(cinnapi cinn_mklml)
   add_dependencies(cinnapi cinn_mklml)
-  if(WITH_MKLDNN)
+  if(WITH_ONEDNN)
     target_link_libraries(cinnapi ${MKLDNN_LIB})
-    add_dependencies(cinnapi ${MKLDNN_PROJECT})
+    add_dependencies(cinnapi ${ONEDNN_PROJECT})
   endif()
 endif()
 
@@ -220,21 +222,25 @@ function(gen_cinncore LINKTYPE)
     schedule_desc_proto
     absl
     isl
-    ginac)
+    ginac
+    pybind
+    op_fusion
+    cinn_op_dialect
+    ${jitify_deps})
   add_dependencies(${CINNCORE_TARGET} GEN_LLVM_RUNTIME_IR_HEADER ZLIB::ZLIB)
   add_dependencies(${CINNCORE_TARGET} GEN_LLVM_RUNTIME_IR_HEADER ${core_deps})
   target_link_libraries(${CINNCORE_TARGET} op_dialect pir phi)
   add_dependencies(${CINNCORE_TARGET} op_dialect pir phi)
-  add_dependencies(${CINNCORE_TARGET} pybind)
+  # add_dependencies(${CINNCORE_TARGET} pybind)
   target_link_libraries(${CINNCORE_TARGET} ${PYTHON_LIBRARIES})
 
   if(WITH_MKL)
     target_link_libraries(${CINNCORE_TARGET} cinn_mklml)
     add_dependencies(${CINNCORE_TARGET} cinn_mklml)
-    if(WITH_MKLDNN)
+    if(WITH_ONEDNN)
       target_link_libraries(${CINNCORE_TARGET} ${MKLDNN_LIB})
-      add_dependencies(${CINNCORE_TARGET} ${MKLDNN_PROJECT})
+      add_dependencies(${CINNCORE_TARGET} ${ONEDNN_PROJECT})
     endif()
   endif()
 
@@ -247,16 +253,16 @@ function(gen_cinncore LINKTYPE)
       ${CUBLAS}
       ${CUDNN}
       ${CURAND}
-      ${CUSOLVER}
-      ${jitify_deps})
+      ${CUSOLVER})
+    # ${jitify_deps})
     if(NVTX_FOUND)
       target_link_libraries(${CINNCORE_TARGET} ${CUDA_NVTX_LIB})
     endif()
   endif()
 
   if(WITH_CUTLASS)
-    target_link_libraries(cinnapi cutlass)
-    add_dependencies(cinnapi cutlass)
+    target_link_libraries(${CINNCORE_TARGET} cutlass)
+    add_dependencies(${CINNCORE_TARGET} cutlass)
   endif()
 endfunction()
diff --git a/cmake/external/json.cmake b/cmake/external/json.cmake
new file mode 100644
index 0000000000000..b219e60cb9950
--- /dev/null
+++ b/cmake/external/json.cmake
@@ -0,0 +1,43 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include(ExternalProject)
+
+set(JSON_PREFIX_DIR ${THIRD_PARTY_PATH}/nlohmann_json)
+set(JSON_INCLUDE_DIR ${JSON_PREFIX_DIR}/include)
+
+set(SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/nlohmann_json)
+set(SOURCE_INCLUDE_DIR ${SOURCE_DIR}/include)
+
+include_directories(${JSON_INCLUDE_DIR})
+
+set(JSON_BuildTests
+    OFF
+    CACHE INTERNAL "")
+
+ExternalProject_Add(
+  extern_json
+  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+  SOURCE_DIR ${SOURCE_DIR}
+  PREFIX ${JSON_PREFIX_DIR}
+  UPDATE_COMMAND ""
+  CONFIGURE_COMMAND ""
+  BUILD_IN_SOURCE 1
+  BUILD_COMMAND ""
+  INSTALL_COMMAND ""
+  TEST_COMMAND "")
+
+add_library(json INTERFACE)
+#target_include_directories(json PRIVATE ${JSON_INCLUDE_DIR})
+add_dependencies(json extern_json)
diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake
index 515952eae88cd..c350f79945163 100644
--- a/cmake/external/lite.cmake
+++ b/cmake/external/lite.cmake
@@ -88,7 +88,7 @@ if(NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
     set(LITE_OPTIONAL_ARGS
         -DWITH_MKL=OFF
         -DLITE_WITH_CUDA=OFF
-        -DWITH_MKLDNN=OFF
+        -DWITH_ONEDNN=OFF
         -DLITE_WITH_X86=OFF
         -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON
         -DLITE_WITH_PROFILE=OFF
@@ -141,7 +141,7 @@ if(NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
     set(LITE_OPTIONAL_ARGS
         -DWITH_MKL=ON
         -DLITE_WITH_CUDA=OFF
-        -DWITH_MKLDNN=OFF
+        -DWITH_ONEDNN=OFF
         -DLITE_WITH_X86=ON
         -DLITE_WITH_PROFILE=OFF
         -DWITH_LITE=OFF
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/onednn.cmake
similarity index 68%
rename from cmake/external/mkldnn.cmake
rename to cmake/external/onednn.cmake
index 650a2a4196c86..8b1969f87b5a2 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/onednn.cmake
@@ -14,13 +14,13 @@
 
 include(ExternalProject)
 
-set(MKLDNN_PROJECT "extern_mkldnn")
-set(MKLDNN_PREFIX_DIR ${THIRD_PARTY_PATH}/mkldnn)
-set(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn)
+set(ONEDNN_PROJECT "extern_onednn")
+set(ONEDNN_PREFIX_DIR ${THIRD_PARTY_PATH}/onednn)
+set(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/onednn)
 set(MKLDNN_INC_DIR
     "${MKLDNN_INSTALL_DIR}/include"
-    CACHE PATH "mkldnn include directory." FORCE)
-set(SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/mkldnn)
+    CACHE PATH "oneDNN include directory." FORCE)
+set(SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/onednn)
 
 # Introduce variables:
 # * CMAKE_INSTALL_LIBDIR
@@ -36,28 +36,28 @@ set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}"
                         "${MKLDNN_INSTALL_DIR}/${LIBDIR}")
 
 include_directories(${MKLDNN_INC_DIR}
-)# For MKLDNN code to include internal headers.
+)# For oneDNN code to include internal headers.
 
 if(NOT WIN32)
-  set(MKLDNN_FLAG
+  set(ONEDNN_FLAG
       "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds"
   )
-  set(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
-  set(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
-  set(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
-  set(MKLDNN_CXXFLAG_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
-  set(MKLDNN_CFLAG_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
+  set(ONEDNN_FLAG "${ONEDNN_FLAG} -Wno-unused-result -Wno-unused-value")
+  set(ONEDNN_CFLAG "${CMAKE_C_FLAGS} ${ONEDNN_FLAG}")
+  set(ONEDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${ONEDNN_FLAG}")
+  set(ONEDNN_CXXFLAG_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
+  set(ONEDNN_CFLAG_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
   set(MKLDNN_LIB
       "${MKLDNN_INSTALL_DIR}/${LIBDIR}/libdnnl.so"
-      CACHE FILEPATH "mkldnn library." FORCE)
+      CACHE FILEPATH "oneDNN library." FORCE)
 else()
-  set(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc")
-  set(MKLDNN_CFLAG "${CMAKE_C_FLAGS}")
-  string(REPLACE "/O2 " "" MKLDNN_CFLAG_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
-  string(REPLACE "/O2 " "" MKLDNN_CXXFLAG_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
+  set(ONEDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc")
+  set(ONEDNN_CFLAG "${CMAKE_C_FLAGS}")
+  string(REPLACE "/O2 " "" ONEDNN_CFLAG_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
+  string(REPLACE "/O2 " "" ONEDNN_CXXFLAG_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
   set(MKLDNN_LIB
       "${MKLDNN_INSTALL_DIR}/bin/mkldnn.lib"
-      CACHE FILEPATH "mkldnn library." FORCE)
+      CACHE FILEPATH "oneDNN library." FORCE)
 endif()
 
 if(LINUX)
@@ -67,21 +67,21 @@ else()
 endif()
 
 ExternalProject_Add(
-  ${MKLDNN_PROJECT}
+  ${ONEDNN_PROJECT}
   ${EXTERNAL_PROJECT_LOG_ARGS}
   SOURCE_DIR ${SOURCE_DIR}
-  DEPENDS ${MKLDNN_DEPENDS}
-  PREFIX ${MKLDNN_PREFIX_DIR}
+  DEPENDS ${ONEDNN_DEPENDS}
+  PREFIX ${ONEDNN_PREFIX_DIR}
   UPDATE_COMMAND ""
   #BUILD_ALWAYS 1
   CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
              -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-             -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
-             -DCMAKE_CXX_FLAGS_RELEASE=${MKLDNN_CXXFLAG_RELEASE}
+             -DCMAKE_CXX_FLAGS=${ONEDNN_CXXFLAG}
+             -DCMAKE_CXX_FLAGS_RELEASE=${ONEDNN_CXXFLAG_RELEASE}
              -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-             -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
+             -DCMAKE_C_FLAGS=${ONEDNN_CFLAG}
              -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-             -DCMAKE_C_FLAGS_RELEASE=${MKLDNN_CFLAG_RELEASE}
+             -DCMAKE_C_FLAGS_RELEASE=${ONEDNN_CFLAG_RELEASE}
              -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
              -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
              -DCMAKE_POSITION_INDEPENDENT_CODE=ON
@@ -90,7 +90,7 @@ ExternalProject_Add(
   CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
   BUILD_BYPRODUCTS ${BUILD_BYPRODUCTS_ARGS})
 
-message(STATUS "MKLDNN library: ${MKLDNN_LIB}")
+message(STATUS "OneDNN library: ${MKLDNN_LIB}")
 add_definitions(-DPADDLE_WITH_DNNL)
 # copy the real so.0 lib to install dir
 # it can be directly contained in wheel or capi
@@ -123,21 +123,21 @@ if(WIN32)
     COMMAND
       lib /def:${MKLDNN_INSTALL_DIR}/bin/mkldnn.def /out:${MKLDNN_LIB}
       /machine:x64
     COMMENT "Generate mkldnn.lib manually--->"
-    DEPENDS ${MKLDNN_PROJECT}
+    DEPENDS ${ONEDNN_PROJECT}
     VERBATIM)
-  add_custom_target(mkldnn_cmd ALL DEPENDS ${MKLDNN_LIB})
+  add_custom_target(onednn_cmd ALL DEPENDS ${MKLDNN_LIB})
 else()
   set(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libdnnl.so.3)
   add_custom_command(
     OUTPUT ${MKLDNN_SHARED_LIB}
     COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
-    DEPENDS ${MKLDNN_PROJECT})
-  add_custom_target(mkldnn_cmd ALL DEPENDS ${MKLDNN_SHARED_LIB})
+    DEPENDS ${ONEDNN_PROJECT})
+  add_custom_target(onednn_cmd ALL DEPENDS ${MKLDNN_SHARED_LIB})
 endif()
 
-# generate a static dummy target to track mkldnn dependencies
-# for cc_library(xxx SRCS xxx.c DEPS mkldnn)
-generate_dummy_static_lib(LIB_NAME "mkldnn" GENERATOR "mkldnn.cmake")
+# generate a static dummy target to track onednn dependencies
+# for cc_library(xxx SRCS xxx.c DEPS onednn)
+generate_dummy_static_lib(LIB_NAME "onednn" GENERATOR "onednn.cmake")
 
-target_link_libraries(mkldnn ${MKLDNN_LIB} ${MKLML_IOMP_LIB})
-add_dependencies(mkldnn ${MKLDNN_PROJECT} mkldnn_cmd)
+target_link_libraries(onednn ${MKLDNN_LIB} ${MKLML_IOMP_LIB})
+add_dependencies(onednn ${ONEDNN_PROJECT} onednn_cmd)
diff --git a/cmake/external/rocksdb.cmake b/cmake/external/rocksdb.cmake
index 072658e54705a..28179cbf1ca20 100644
--- a/cmake/external/rocksdb.cmake
+++ b/cmake/external/rocksdb.cmake
@@ -14,8 +14,6 @@
 
 include(ExternalProject)
 
-# find_package(jemalloc REQUIRED)
-
 set(ROCKSDB_SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/rocksdb)
 set(ROCKSDB_TAG 6.19.fb)
 
@@ -32,28 +30,10 @@ set(ROCKSDB_INCLUDE_DIR
 set(ROCKSDB_LIBRARIES
     "${ROCKSDB_INSTALL_DIR}/lib/librocksdb.a"
     CACHE FILEPATH "rocksdb library." FORCE)
-set(ROCKSDB_COMMON_FLAGS
-    "-g -pipe -O2 -W -Wall -Wno-unused-parameter -fPIC -fno-builtin-memcmp -fno-omit-frame-pointer"
-)
-set(ROCKSDB_FLAGS
-    "-DNDEBUG -DROCKSDB_JEMALLOC -DJEMALLOC_NO_DEMANGLE -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DOS_LINUX -DROCKSDB_FALLOCATE_PRESENT -DHAVE_PCLMUL -DZLIB -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_PTHREAD_ADAPTIVE_MUTEX -DROCKSDB_BACKTRACE -DROCKSDB_SUPPORT_THREAD_LOCAL -DROCKSDB_USE_RTTI -DROCKSDB_SCHED_GETCPU_PRESENT -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_AUXV_GETAUXVAL_PRESENT"
-)
-set(ROCKSDB_CMAKE_CXX_FLAGS
-    "${ROCKSDB_COMMON_FLAGS} -DROCKSDB_LIBAIO_PRESENT ${ROCKSDB_FLAGS} -fPIC -I${JEMALLOC_INCLUDE_DIR}"
-)
-if(NOT WITH_ARM)
-  set(ROCKSDB_FLAGS "${ROCKSDB_FLAGS} -DHAVE_SSE42")
-  set(ROCKSDB_CMAKE_CXX_FLAGS
-      "${ROCKSDB_CMAKE_CXX_FLAGS} -msse -msse4.2 -mpclmul")
-endif()
-set(ROCKSDB_CMAKE_C_FLAGS
-    "${ROCKSDB_COMMON_FLAGS} ${ROCKSDB_FLAGS} -DROCKSDB_LIBAIO_PRESENT -fPIC -I${JEMALLOC_INCLUDE_DIR}"
-)
-include_directories(${ROCKSDB_INCLUDE_DIR})
-set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread")
-
-set(ROCKSDB_CMAKE_SHARED_LINKER_FLAGS "-ldl -lrt -lz")
+set(ROCKSDB_CXX_FLAGS
+    "${CMAKE_CXX_FLAGS} -DROCKSDB_LIBAIO_PRESENT -I${JEMALLOC_INCLUDE_DIR}")
+set(ROCKSDB_SHARED_LINKER_FLAGS "-Wl,--no-as-needed -ldl")
 
 if(WITH_ARM)
   file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/rocksdb/libaio.h.patch
@@ -62,6 +42,7 @@ if(WITH_ARM)
     git checkout -- . && git checkout ${ROCKSDB_TAG} && patch -Nd
     ${PADDLE_SOURCE_DIR}/third_party/rocksdb/env/ < ${native_src})
 endif()
+
 ExternalProject_Add(
   extern_rocksdb
   ${EXTERNAL_PROJECT_LOG_ARGS}
@@ -76,25 +57,23 @@ ExternalProject_Add(
     -DWITH_GFLAGS=OFF
     -DWITH_TESTS=OFF
     -DWITH_JEMALLOC=ON
-    -DWITH_BENCHMARK_TOOLS=OFF
-    -DFAIL_ON_WARNINGS=OFF # For Clang compatibility
     -DJeMalloc_LIBRARIES=${JEMALLOC_LIBRARIES}
     -DJeMalloc_INCLUDE_DIRS=${JEMALLOC_INCLUDE_DIR}
-    -DCMAKE_CXX_FLAGS=${ROCKSDB_CMAKE_CXX_FLAGS}
-    -DCMAKE_C_FLAGS=${ROCKSDB_CMAKE_C_FLAGS}
-    -DCMAKE_SHARED_LINKER_FLAGS=${ROCKSDB_CMAKE_SHARED_LINKER_FLAGS}
-  INSTALL_COMMAND
-    mkdir -p ${ROCKSDB_INSTALL_DIR}/lib/ && cp
-    ${ROCKSDB_PREFIX_DIR}/src/extern_rocksdb-build/librocksdb.a
-    ${ROCKSDB_LIBRARIES} && cp -r ${ROCKSDB_SOURCE_DIR}/include
-    ${ROCKSDB_INSTALL_DIR}/
+    -DWITH_BENCHMARK_TOOLS=OFF
+    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+    -DCMAKE_CXX_FLAGS=${ROCKSDB_CXX_FLAGS}
+    -DCMAKE_SHARED_LINKER_FLAGS=${ROCKSDB_SHARED_LINKER_FLAGS}
+  CMAKE_CACHE_ARGS
+    -DCMAKE_INSTALL_PREFIX:PATH=${ROCKSDB_INSTALL_DIR}
+    -DCMAKE_INSTALL_LIBDIR:PATH=${ROCKSDB_INSTALL_DIR}/lib
+    -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+    -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
   BUILD_BYPRODUCTS ${ROCKSDB_LIBRARIES})
 
+add_dependencies(extern_rocksdb snappy extern_jemalloc)
 add_library(rocksdb STATIC IMPORTED GLOBAL)
-
-add_dependencies(extern_rocksdb snappy)
-add_dependencies(extern_rocksdb extern_jemalloc)
 set_property(TARGET rocksdb PROPERTY IMPORTED_LOCATION ${ROCKSDB_LIBRARIES})
+include_directories(${ROCKSDB_INCLUDE_DIR})
 add_dependencies(rocksdb extern_rocksdb)
 
 list(APPEND external_project_dependencies rocksdb)
diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index 5b8dd6e0ffe59..940e3804559ef 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -28,8 +28,11 @@ set(XPU_XFA_LIB_NAME "libxpu_flash_attention.so")
 if(NOT DEFINED XPU_BASE_DATE)
   set(XPU_BASE_DATE "20240104")
 endif()
+if(NOT DEFINED XPU_XDNN_BASE_DATE)
+  set(XPU_XDNN_BASE_DATE "20240327")
+endif()
 if(NOT DEFINED XPU_XHPC_BASE_DATE)
-  set(XPU_XHPC_BASE_DATE "20240328")
+  set(XPU_XHPC_BASE_DATE "20240413")
 endif()
 set(XPU_XCCL_BASE_VERSION "1.1.8.1")
 if(NOT DEFINED XPU_XFT_BASE_VERSION)
@@ -45,6 +48,10 @@ else()
   set(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()
 
+set(XPU_XDNN_BASE_URL
+    "https://klx-sdk-release-public.su.bcebos.com/xdnn/stable/${XPU_XDNN_BASE_DATE}"
+)
+
 set(XPU_XCCL_BASE_URL
     "https://klx-sdk-release-public.su.bcebos.com/xccl/release/${XPU_XCCL_BASE_VERSION}"
 )
@@ -105,7 +112,7 @@ set(XPU_XRE_URL
     "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz"
     CACHE STRING "" FORCE)
 set(XPU_XDNN_URL
-    "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz"
+    "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz"
     CACHE STRING "" FORCE)
 set(XPU_XCCL_URL
     "${XPU_XCCL_BASE_URL}/${XPU_XCCL_DIR_NAME}.tar.gz"
@@ -229,7 +236,7 @@ if(WITH_XPTI)
 endif()
 
 if(WITH_XPU_XHPC)
-  target_link_libraries(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_XBLAS_LIB}
+  target_link_libraries(xpulib ${XPU_RT_LIB} ${XPU_XBLAS_LIB} ${XPU_API_LIB}
                         ${XPU_XFA_LIB})
 endif()
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 23f7ff529fe7a..8279f83369ca8 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -167,6 +167,7 @@ if(NOT WIN32)
   if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
     set(COMMON_FLAGS
         ${COMMON_FLAGS}
+        -Wno-error=unknown-warning-option # For some unknown warning options in lower version clang
        -Wno-error=unused-private-field
        -Wno-error=unused-const-variable
        -Wno-error=deprecated-copy-with-user-provided-copy # For three/five/zeros rule, clang
@@ -211,6 +212,11 @@ if(NOT WIN32)
      -Wno-error=unused-function # Warnings in Numpy Header.
      -Wno-error=array-bounds # Warnings in Eigen::array
  )
+
+  if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+    set(GPU_COMMON_FLAGS -ccbin=${CMAKE_CXX_COMPILER} ${GPU_COMMON_FLAGS})
+  endif()
+
   if(NOT WITH_NV_JETSON
      AND NOT WITH_ARM
      AND NOT WITH_SW
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index d618c9667de83..4c8819e438a2f 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -598,9 +598,9 @@ function(paddle_test_build TARGET_NAME)
                           ${paddle_test_DEPS} common paddle_gtest_main_new)
     add_dependencies(${TARGET_NAME} ${paddle_lib} ${paddle_test_DEPS} common
                      paddle_gtest_main_new)
-    if(WITH_MKLDNN)
-      target_link_libraries(${TARGET_NAME} mkldnn)
-      add_dependencies(${TARGET_NAME} mkldnn)
+    if(WITH_ONEDNN)
+      target_link_libraries(${TARGET_NAME} onednn)
+      add_dependencies(${TARGET_NAME} onednn)
     endif()
     if(WITH_SHARED_PHI)
       target_link_libraries(${TARGET_NAME} $<TARGET_LINKER_FILE:phi>)
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 3005da8aea125..3b81733d279d7 100755
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -114,8 +114,8 @@ function(copy_part_of_third_party TARGET DST)
     endif()
   endif()
 
-  if(WITH_MKLDNN)
-    set(dst_dir "${DST}/third_party/install/mkldnn")
+  if(WITH_ONEDNN)
+    set(dst_dir "${DST}/third_party/install/onednn")
     if(WIN32)
       copy(
         ${TARGET}
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index c7dfb4ac641d2..f089f6e55b17b 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -102,42 +102,42 @@ function(register_cu_kernel TARGET)
   endforeach()
 endfunction()
 
-# Just for those mkldnn kernels locating at "fluid/operators/mkldnn/", such as 'layer_norm_mkldnn_op.cc'.
+# Just for those onednn kernels locating at "fluid/operators/onednn/", such as 'layer_norm_onednn_op.cc'.
 # Add other file modes if need in the future.
-function(register_mkldnn_kernel TARGET)
+function(register_onednn_kernel TARGET)
   set(options "")
   set(oneValueArgs "")
   set(multiValueArgs SRCS DEPS)
-  cmake_parse_arguments(register_mkldnn_kernel "${options}" "${oneValueArgs}"
+  cmake_parse_arguments(register_onednn_kernel "${options}" "${oneValueArgs}"
                         "${multiValueArgs}" ${ARGN})
-  set(mkldnn_cc_srcs)
+  set(onednn_cc_srcs)
   set(op_common_deps operator op_registry phi layer
                      common_infer_shape_functions)
-  foreach(mkldnn_src ${register_mkldnn_kernel_SRCS})
-    if(${mkldnn_src} MATCHES ".*_mkldnn_op.cc$")
-      list(APPEND mkldnn_cc_srcs mkldnn/${mkldnn_src})
+  foreach(onednn_src ${register_onednn_kernel_SRCS})
+    if(${onednn_src} MATCHES ".*_onednn_op.cc$")
+      list(APPEND onednn_cc_srcs onednn/${onednn_src})
     endif()
   endforeach()
-  list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
-  if(${mkldnn_cc_srcs_len} EQUAL 0)
+  list(LENGTH onednn_cc_srcs onednn_cc_srcs_len)
+  if(${onednn_cc_srcs_len} EQUAL 0)
     message(
       FATAL_ERROR
-        "The MKLDNN kernel file of ${TARGET} should contains at least one *.*_mkldnn_op.cc file"
+        "The MKLDNN kernel file of ${TARGET} should contains at least one *.*_onednn_op.cc file"
     )
   endif()
-  if(WITH_MKLDNN)
+  if(WITH_ONEDNN)
     cc_library(
       ${TARGET}
-      SRCS ${mkldnn_cc_srcs}
+      SRCS ${onednn_cc_srcs}
       DEPS ${op_library_DEPS} ${op_common_deps})
   endif()
   set(OP_LIBRARY
       ${TARGET} ${OP_LIBRARY}
       CACHE INTERNAL "op libs")
-  foreach(mkldnn_src ${mkldnn_cc_srcs})
+  foreach(onednn_src ${onednn_cc_srcs})
     set(op_name "")
-    find_register(${mkldnn_src} "REGISTER_OP_KERNEL" op_name)
+    find_register(${onednn_src} "REGISTER_OP_KERNEL" op_name)
     if(NOT ${op_name} EQUAL "")
       file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, MKLDNN);\n")
     endif()
@@ -161,7 +161,7 @@ function(op_library TARGET)
   set(miopen_cu_srcs)
   set(CUDNN_FILE)
   set(MIOPEN_FILE)
-  set(mkldnn_cc_srcs)
+  set(onednn_cc_srcs)
   set(MKLDNN_FILE)
   set(op_common_deps operator op_registry phi layer
                      common_infer_shape_functions)
@@ -237,10 +237,10 @@ function(op_library TARGET)
       list(APPEND miopen_cu_srcs ${MIOPEN_FILE}.cu)
     endif()
   endif()
-  if(WITH_MKLDNN)
-    string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
-    if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn/${MKLDNN_FILE}.cc)
-      list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc)
+  if(WITH_ONEDNN)
+    string(REPLACE "_op" "_onednn_op" MKLDNN_FILE "${TARGET}")
+    if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/onednn/${MKLDNN_FILE}.cc)
+      list(APPEND onednn_cc_srcs onednn/${MKLDNN_FILE}.cc)
     endif()
   endif()
   if(WITH_XPU)
@@ -275,8 +275,8 @@ function(op_library TARGET)
         list(APPEND cudnn_cu_cc_srcs ${src})
       elseif(WITH_GPU AND ${src} MATCHES ".*\\.cu.cc$")
         list(APPEND cu_cc_srcs ${src})
-      elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
-        list(APPEND mkldnn_cc_srcs ${src})
+      elseif(WITH_ONEDNN AND ${src} MATCHES ".*_onednn_op.cc$")
+        list(APPEND onednn_cc_srcs ${src})
       elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$")
         list(APPEND xpu_cc_srcs ${src})
       elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.xpu$")
@@ -349,7 +349,7 @@ function(op_library TARGET)
   if(WITH_UNITY_BUILD AND op_library_UNITY)
     # Combine the cc and cu source files.
     compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${cu_cc_srcs}
-                                 ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs})
+                                 ${cudnn_cu_cc_srcs} ${onednn_cc_srcs})
     compose_unity_target_sources(${UNITY_TARGET} cu ${cudnn_cu_srcs}
                                  ${cu_srcs})
     if(TARGET ${UNITY_TARGET})
@@ -369,7 +369,7 @@ function(op_library TARGET)
       nv_library(
         ${TARGET}
         SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${cudnn_cu_srcs}
-             ${mkldnn_cc_srcs} ${cu_srcs}
+             ${onednn_cc_srcs} ${cu_srcs}
         DEPS ${op_library_DEPS} ${op_common_deps})
     endif()
   elseif(WITH_ROCM)
@@ -389,19 +389,19 @@ function(op_library TARGET)
     hip_library(
       ${TARGET}
       SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs}
-           ${mkldnn_cc_srcs} ${hip_srcs}
+           ${onednn_cc_srcs} ${hip_srcs}
       DEPS ${op_library_DEPS} ${op_common_deps})
   elseif(WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0)
     xpu_library(
       ${TARGET}
-      SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${xpu_kp_cc_srcs}
+      SRCS ${cc_srcs} ${onednn_cc_srcs} ${xpu_cc_srcs} ${xpu_kp_cc_srcs}
       DEPS ${op_library_DEPS} ${op_common_deps})
   else()
     # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
     if(WITH_UNITY_BUILD AND op_library_UNITY)
       # Combine the cc source files.
       compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs}
-                                   ${mkldnn_cc_srcs} ${xpu_cc_srcs})
+                                   ${onednn_cc_srcs} ${xpu_cc_srcs})
       if(TARGET ${UNITY_TARGET})
         # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`.
         target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources})
@@ -417,7 +417,7 @@ function(op_library TARGET)
     else()
       cc_library(
         ${TARGET}
-        SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs}
+        SRCS ${cc_srcs} ${onednn_cc_srcs} ${xpu_cc_srcs}
         DEPS ${op_library_DEPS} ${op_common_deps})
     endif()
   endif()
@@ -426,7 +426,7 @@ function(op_library TARGET)
   list(LENGTH hip_srcs hip_srcs_len)
   list(LENGTH cu_cc_srcs cu_cc_srcs_len)
   list(LENGTH hip_cc_srcs hip_cc_srcs_len)
-  list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
+  list(LENGTH onednn_cc_srcs onednn_cc_srcs_len)
   list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
   list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len)
 
@@ -463,7 +463,7 @@ function(op_library TARGET)
       find_register(${cc_src} "REGISTER_OPERATOR" op_name)
       if(NOT ${op_name} EQUAL "")
         file(APPEND ${pybind_file} "USE_OP_ITSELF(${op_name});\n")
-        # hack: for example, the target in conv_transpose_op.cc is conv2d_transpose, used in mkldnn
+        # hack: for example, the target in conv_transpose_op.cc is conv2d_transpose, used in onednn
         set(TARGET ${op_name})
         set(pybind_flag 1)
       endif()
@@ -474,7 +474,7 @@ function(op_library TARGET)
       find_register(${cc_src} "REGISTER_ACTIVATION_OP" op_name)
      if(NOT ${op_name} EQUAL "")
        file(APPEND ${pybind_file} "USE_OP_ITSELF(${op_name});\n")
-        # hack: for example, the target in conv_transpose_op.cc is conv2d_transpose, used in mkldnn
+        # hack: for example, the target in conv_transpose_op.cc is conv2d_transpose, used in onednn
        set(TARGET ${op_name})
        set(pybind_flag 1)
      endif()
@@ -483,7 +483,7 @@ function(op_library TARGET)
      find_register(${cc_src} "REGISTER_OP_WITHOUT_GRADIENT" op_name)
      if(NOT ${op_name} EQUAL "")
        file(APPEND ${pybind_file} "USE_OP_ITSELF(${op_name});\n")
-        # hack: for example, the target in conv_transpose_op.cc is conv2d_transpose, used in mkldnn
+        # hack: for example, the target in conv_transpose_op.cc is conv2d_transpose, used in onednn
        set(TARGET ${op_name})
        set(pybind_flag 1)
      endif()
@@ -496,8 +496,8 @@ function(op_library TARGET)
      # why change TARGET here?
      # when building paddle with on_infer, the REGISTER_OPERATOR(*_grad) will be removed before compiling (see details in remove_grad_op_and_kernel.py)
      # in elementwise_op.cc, it will find REGISTER_OPERATOR(grad_add) and set TARGET to grad_add
-      # and, in the following "mkldnn" part, it will add USE_OP_DEVICE_KERNEL(grad_add, MKLDNN) to pybind.h
-      # however, grad_add has no mkldnn kernel.
+      # and, in the following "onednn" part, it will add USE_OP_DEVICE_KERNEL(grad_add, MKLDNN) to pybind.h
+      # however, grad_add has no onednn kernel.
      set(TARGET ${op_name})
      set(pybind_flag 1)
    endif()
@@ -520,16 +520,16 @@ function(op_library TARGET)
    endif()
  endforeach()
 
-  # pybind USE_OP_DEVICE_KERNEL for operators/mkldnn/*
-  list(APPEND mkldnn_srcs ${mkldnn_cc_srcs})
-  foreach(mkldnn_src ${mkldnn_srcs})
+  # pybind USE_OP_DEVICE_KERNEL for operators/onednn/*
+  list(APPEND onednn_srcs ${onednn_cc_srcs})
+  foreach(onednn_src ${onednn_srcs})
    set(op_name "")
    # Add PHI Kernel Registry Message
-    find_phi_register(${mkldnn_src} ${pybind_file} "PD_REGISTER_KERNEL")
-    find_phi_register(${mkldnn_src} ${pybind_file} "PD_REGISTER_STRUCT_KERNEL")
-    find_phi_register(${mkldnn_src} ${pybind_file}
+    find_phi_register(${onednn_src} ${pybind_file} "PD_REGISTER_KERNEL")
+    find_phi_register(${onednn_src} ${pybind_file} "PD_REGISTER_STRUCT_KERNEL")
+    find_phi_register(${onednn_src} ${pybind_file}
                      "PD_REGISTER_KERNEL_FOR_ALL_DTYPE")
-    find_register(${mkldnn_src} "REGISTER_OP_CUDA_KERNEL" op_name)
+    find_register(${onednn_src} "REGISTER_OP_CUDA_KERNEL" op_name)
    if(NOT ${op_name} EQUAL "")
      file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDA);\n")
      set(pybind_flag 1)
@@ -610,14 +610,14 @@ function(op_library TARGET)
  endif()
 
  # pybind USE_OP_DEVICE_KERNEL for MKLDNN
-  if(WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
+  if(WITH_ONEDNN AND ${onednn_cc_srcs_len} GREATER 0)
    # Append first implemented MKLDNN activation operator
-    if(${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
+    if(${MKLDNN_FILE} STREQUAL "activation_onednn_op")
      file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(softplus, MKLDNN);\n")
    else()
-      foreach(mkldnn_src ${mkldnn_cc_srcs})
+      foreach(onednn_src ${onednn_cc_srcs})
        set(op_name "")
-        find_register(${mkldnn_src} "REGISTER_OP_KERNEL" op_name)
+        find_register(${onednn_src} "REGISTER_OP_KERNEL" op_name)
        if(NOT ${op_name} EQUAL "")
          file(APPEND ${pybind_file}
               "USE_OP_DEVICE_KERNEL(${op_name}, MKLDNN);\n")
@@ -666,7 +666,7 @@ function(register_operators)
    GLOB OPS
    RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
    "*_op.cc")
-  string(REPLACE "_mkldnn" "" OPS "${OPS}")
+  string(REPLACE "_onednn" "" OPS "${OPS}")
  string(REPLACE "_xpu" "" OPS "${OPS}")
  string(REPLACE ".cc" "" OPS "${OPS}")
  list(REMOVE_DUPLICATES OPS)
diff --git a/cmake/simd.cmake b/cmake/simd.cmake
index 676a25118303c..a305ef4759500 100644
--- a/cmake/simd.cmake
+++ b/cmake/simd.cmake
@@ -11,12 +11,17 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
   set(AVX_FLAG "-mavx")
   set(AVX2_FLAG "-mavx2")
   set(AVX512F_FLAG "-mavx512f")
+  set(Wno_Maybe_Uninitialized "-Wno-maybe-uninitialized")
+  set(FMA_FLAG "-mfma")
 elseif(MSVC)
   set(MMX_FLAG "/arch:MMX")
   set(SSE2_FLAG "/arch:SSE2")
   set(SSE3_FLAG "/arch:SSE3")
   set(AVX_FLAG "/arch:AVX")
   set(AVX2_FLAG "/arch:AVX2")
+  set(AVX512F_FLAG "/arch:AVX512")
+  set(Wno_Maybe_Uninitialized "/wd4701")
+  set(FMA_FLAG "/arch:AVX2")
 endif()
 
 set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS})
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index 9839f32f83c2b..e90a1c860eb31 100755
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -254,19 +254,19 @@ if(WIN32 OR APPLE)
 endif()
 
 set(WITH_MKLML ${WITH_MKL})
-if(NOT DEFINED WITH_MKLDNN)
+if(NOT DEFINED WITH_ONEDNN)
   if(WITH_MKL AND AVX2_FOUND)
-    set(WITH_MKLDNN ON)
+    set(WITH_ONEDNN ON)
   else()
     message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN.")
-    set(WITH_MKLDNN OFF)
+    set(WITH_ONEDNN OFF)
   endif()
 endif()
 
 if(WIN32)
   if(MSVC)
     if(MSVC_VERSION LESS 1920)
-      set(WITH_MKLDNN OFF)
+      set(WITH_ONEDNN OFF)
     endif()
   endif()
 endif()
@@ -303,7 +303,7 @@ if(WITH_CINN)
   if(WITH_MKL)
     add_definitions(-DCINN_WITH_MKL_CBLAS)
   endif()
-  if(WITH_MKLDNN)
+  if(WITH_ONEDNN)
     add_definitions(-DCINN_WITH_DNNL)
   endif()
   include(cmake/cinn/version.cmake)
@@ -362,9 +362,9 @@ elseif(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS)
   list(APPEND third_party_deps extern_openblas)
 endif()
 
-if(WITH_MKLDNN)
-  include(external/mkldnn) # download, build, install mkldnn
-  list(APPEND third_party_deps extern_mkldnn)
+if(WITH_ONEDNN)
+  include(external/onednn) # download, build, install onednn
+  list(APPEND third_party_deps extern_onednn)
 endif()
 
 include(external/protobuf) # find first, then download, build, install protobuf
@@ -372,6 +372,11 @@ if(TARGET extern_protobuf)
   list(APPEND third_party_deps extern_protobuf)
 endif()
 
+include(external/json) # find first, then build json
+if(TARGET extern_json)
+  list(APPEND third_party_deps extern_json)
+endif()
+
 if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
   include(external/python) # find python and python_module
   include(external/pybind11) # prepare submodule pybind11
diff --git a/cmake/version.cmake b/cmake/version.cmake
index 28f022e0afa0e..185418127fdf4 100644
--- a/cmake/version.cmake
+++ b/cmake/version.cmake
@@ -96,7 +96,7 @@ function(version version_file)
       "Paddle version: ${PADDLE_VERSION}\n"
       "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
       "WITH_MKL: ${WITH_MKL}\n"
-      "WITH_MKLDNN: ${WITH_MKLDNN}\n"
+      "WITH_ONEDNN: ${WITH_ONEDNN}\n"
       "WITH_GPU: ${WITH_GPU}\n"
       "WITH_ROCM: ${WITH_ROCM}\n"
       "WITH_IPU: ${WITH_IPU}\n")
diff --git a/paddle/cinn/CMakeLists.txt b/paddle/cinn/CMakeLists.txt
index 8369b4d8bf8e7..487f7167212fe 100644
--- a/paddle/cinn/CMakeLists.txt
+++ b/paddle/cinn/CMakeLists.txt
@@ -17,6 +17,7 @@ add_subdirectory(optim)
 add_subdirectory(hlir)
 add_subdirectory(pybind)
 add_subdirectory(frontend)
+add_subdirectory(operator_fusion)
 
 # Download a model
 download_and_uncompress("${DOWNLOAD_MODEL_DIR}" "${PADDLE_RESOURCE_URL}"
diff --git a/paddle/cinn/api/op_topo_pattern.h b/paddle/cinn/api/op_topo_pattern.h
deleted file mode 100644
index 34f17fbfde9e0..0000000000000
--- a/paddle/cinn/api/op_topo_pattern.h
+++ /dev/null
@@ -1,77 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <functional>
-#include <variant>
-#include <vector>
-
-namespace cinn::api {
-
-template <typename T>
-struct ErrorPattern {};
-
-// ElementWise/Broadcast/Injective Ops without reduction ancestors.
-template <typename T>
-struct InjectiveSourcePattern {};
-
-// Reduce op
-template <typename T>
-struct SingleReductionOpPattern {};
-
-// ElementWise/Broadcast ops which have shardable dimentions and reduction
-// ancestors.
-template <typename T>
-struct PartialShardablePattern {};
-
-// Reduce base pattern
-template <typename T>
-struct ReductionPattern {
-  using Nothing = std::monostate;
-  std::variant<Nothing, InjectiveSourcePattern<T>, PartialShardablePattern<T>>
-      input;
-  SingleReductionOpPattern<T> reduce_op_pattern;
-
-  bool HasFusedInput() const {
-    return !std::holds_alternative<Nothing>(this->input);
-  }
-};
-
-// Stmt := IS | R | PS
-// ops in StmtPattern will be lowered into a inlined cuda code.
-template <typename T>
-using StmtPattern = std::variant<InjectiveSourcePattern<T>,
-                                 ReductionPattern<T>,
-                                 PartialShardablePattern<T>>;
-
-// Stmts := [Stmt]
-template <typename T>
-using StmtPatternVec = std::vector<StmtPattern<T>>;
-// fuse rules:
-// 1. IS * IS -> IS
-// 2. PS * PS -> PS
-// 3. IS * PS -> PS
-// 4. IS * R -> R
-// 5. PS * R -> R
-// lifting rules:
-// 1. R -> Stmts
-// 2. PS -> Stmts
-// 3. Stmts * Stmts -> Stmts
-// OpTopoPattern := Error | Stmts
-
-template <typename T>
-using OpTopoPattern = std::variant<ErrorPattern<T>, StmtPatternVec<T>>;
-
-}  // namespace cinn::api
diff --git a/paddle/cinn/ast_gen_ius/ast_gen.cc b/paddle/cinn/ast_gen_ius/ast_gen.cc
index 45923624945d0..89cfd3f7d462f 100644
--- a/paddle/cinn/ast_gen_ius/ast_gen.cc
+++ b/paddle/cinn/ast_gen_ius/ast_gen.cc
@@ -68,8 +68,11 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
   const std::vector<ir::Var>& axis = tensor->axis();
   const std::vector<ir::Expr>& shape = tensor->shape;
   size_t axis_len = axis.size();
-  CHECK_EQ(shape.size(), axis_len) << "Internal Error: Tensor has different "
-                                      "shape and axis length in AstGen";
+  PADDLE_ENFORCE_EQ(
+      shape.size(),
+      axis_len,
+      phi::errors::InvalidArgument("Internal Error: Tensor has different "
+                                   "shape and axis length in AstGen"));
   std::vector<ir::Expr> axis_exprs;
   for (const auto& a : axis) {
     axis_exprs.push_back(a);
diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.cc
index 8b99fd6e61e22..cc14fc369d94d 100644
--- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.cc
+++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.cc
@@ -434,9 +434,9 @@ void MultiLevelTiling::ApplyCacheWrite(ir::IRSchedule* ir_schedule,
   }
 }
 
-const std::unordered_map<common::Target::Arch, MultiLevelTiling::Config>
+const std::unordered_map<common::Arch, MultiLevelTiling::Config>
     MultiLevelTiling::kConfigs{
-        {cinn::common::Target::Arch::NVGPU,
+        {cinn::common::NVGPUArch{},
         MultiLevelTiling::Config{
             /*bind_axis*/ std::vector<std::string>{"blockIdx.x",
                                                    "threadIdx.x"},
@@ -446,7 +446,7 @@ const std::unordered_map<common::Target::Arch, MultiLevelTiling::Config>
             /*write_cache_memory_type*/ std::string("local"),
             /*write_cache_levels*/ std::vector<int>{3},
         }},
-        {cinn::common::Target::Arch::X86,
+        {cinn::common::X86Arch{},
         MultiLevelTiling::Config{
             /*bind_axis*/ std::vector<std::string>{},
             /*tile_struct*/ std::string("SSRSRS"),
diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h
index 617cc24998bbb..1bbc8da4497d6 100644
--- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h
+++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h
@@ -53,7 +53,7 @@ class MultiLevelTiling : public AutoGenRule {
     std::vector<int> write_cache_levels;
   };
 
-  static const std::unordered_map<common::Target::Arch, Config> kConfigs;
+  static const std::unordered_map<common::Arch, Config> kConfigs;
 
   MultiLevelTiling(const cinn::common::Target& target, const Config& config);
   ~MultiLevelTiling() = default;
diff --git a/paddle/cinn/auto_schedule/task/tune_task_test.cc b/paddle/cinn/auto_schedule/task/tune_task_test.cc
index 733197b0a6f97..2a4ce9e46fdd8 100644
--- a/paddle/cinn/auto_schedule/task/tune_task_test.cc
+++ b/paddle/cinn/auto_schedule/task/tune_task_test.cc
@@ -301,7 +301,7 @@ TEST(TuneTask, SerializeToString) {
   }
 
 #ifdef CINN_WITH_CUDA
-  std::string single_add_str = R"ROC(Target<linux,nvgpu,64>
+  std::string single_add_str = R"ROC(Target<linux,NVGPU,64>
 Group {
   (var_1->float32[32,24]) = elementwise_add(A->float32[32,24], B->float32[32,24])
 }
@@ -324,7 +324,7 @@ Group {
   fused_tasks[0].Initialize(shape_dict, dtype_dict, &op_lowerer);
 
 #ifdef CINN_WITH_CUDA
-  std::string fused_expected_str = R"ROC(Target<linux,nvgpu,64>
+  std::string fused_expected_str = R"ROC(Target<linux,NVGPU,64>
 Group {
   (var_1->float32[32,24]) = elementwise_add(A->float32[32,24], B->float32[32,24])
   (var_2->float32[32,24]) = elementwise_add(var_1->float32[32,24], C->float32[32,24])
 }
 )ROC";
 #else
-  std::string fused_expected_str = R"ROC(Target<linux,x86,64>
+  std::string fused_expected_str = R"ROC(Target<linux,X86,64>
 Group {
   (var_1->float32[32,24]) = elementwise_add(A->float32[32,24], B->float32[32,24])
diff --git a/paddle/cinn/backends/codegen_c.cc b/paddle/cinn/backends/codegen_c.cc
index c585aa843a432..85443b02c0a8c 100644
--- a/paddle/cinn/backends/codegen_c.cc
+++ b/paddle/cinn/backends/codegen_c.cc
@@ -434,31 +434,37 @@ void CodeGenC::Visit(const ir::_Module_ *op) { CINN_NOT_IMPLEMENTED }
 void CodeGenC::Visit(const ir::_Var_ *op) { str_ += op->name; }
 
 void CodeGenC::Visit(const ir::Load *op) {
-  ir::Expr op_index = op->index();
-  Expr dense_strided_ramp = detail::StridedRampBase(op_index, 1);
+  ir::Expr offset = [&] {
+    if (load_to_offset_.count(op) == 0) {
+      load_to_offset_[op] = op->index();
+    }
+    return load_to_offset_.at(op);
+  }();
+
+  Expr dense_strided_ramp = detail::StridedRampBase(offset, 1);
   if (dense_strided_ramp.defined()) {  // Loading a continuous Ramp address.
     CHECK(op->type().is_vector());
-    PrintStackVecType(op->type().ElementOf(), op_index.type().lanes());
+    PrintStackVecType(op->type().ElementOf(), offset.type().lanes());
     str_ += "::";
     str_ += "Load(";
     str_ += op->tensor.As<ir::_Tensor_>()->name;
     str_ += ",";
     IrPrinter::Visit(dense_strided_ramp);
     str_ += ")";
-  } else if (op_index.type().is_vector()) {
+  } else if (offset.type().is_vector()) {
     // gather
     CHECK(op->type().is_vector());
-    PrintStackVecType(op->type().ElementOf(), op_index.type().lanes());
+    PrintStackVecType(op->type().ElementOf(), offset.type().lanes());
     str_ += "::Load(";
     str_ += op->tensor.As<ir::_Tensor_>()->name;
     str_ += ",";
-    IrPrinter::Visit(op_index);
+    IrPrinter::Visit(offset);
     str_ += ")";
   } else if (op->is_addr_tensor()) {
     auto *tensor = op->tensor.As<ir::_Tensor_>();
     str_ += tensor->name;
     str_ += "[";
-    IrPrinter::Visit(op_index);
+    IrPrinter::Visit(offset);
     str_ += "]";
   } else {
     IrPrinter::Visit(op);
@@ -467,12 +473,17 @@ void CodeGenC::Visit(const ir::Load *op) {
 
 void CodeGenC::Visit(const ir::Store *op) {
   CHECK(op->is_addr_tensor());
-
+  ir::Expr offset = [&] {
+    if (store_to_offset_.count(op) == 0) {
+      store_to_offset_[op] = op->index();
+    }
+    return store_to_offset_.at(op);
+  }();
   auto *tensor = op->tensor.As<ir::_Tensor_>();
   CHECK(tensor);
   str_ += tensor->name;
   str_ += "[";
-  IrPrinter::Visit(op->index());
+  IrPrinter::Visit(offset);
   str_ += "]";
   str_ += " = ";
   IrPrinter::Visit(op->value);
diff --git a/paddle/cinn/backends/codegen_c.h b/paddle/cinn/backends/codegen_c.h
index c50c85741ce56..2904bef80beea 100644
--- a/paddle/cinn/backends/codegen_c.h
+++ b/paddle/cinn/backends/codegen_c.h
@@ -118,6 +118,8 @@ class CodeGenC : public ir::IrPrinter {
   Target target_;
   std::stringstream ss_;
   bool inline_builtin_codes_{true};
+  std::unordered_map<const ir::Store *, ir::Expr> store_to_offset_;
+  std::unordered_map<const ir::Load *, ir::Expr> load_to_offset_;
 };
 
 namespace detail {
diff --git a/paddle/cinn/backends/codegen_c_test.cc b/paddle/cinn/backends/codegen_c_test.cc
index 61adad6ade461..b0eb626210736 100644
--- a/paddle/cinn/backends/codegen_c_test.cc
+++ b/paddle/cinn/backends/codegen_c_test.cc
@@ -61,7 +61,7 @@ TEST(CodeGenC, module) {
   LOG(INFO) << "C.body: " << C->get_compute_op()->body.front();
 
   Target target;
-  target.arch = Target::Arch::X86;
+  target.arch = common::X86Arch{};
   target.bits = Target::Bit::k32;
   target.os = Target::OS::Linux;
   Module::Builder builder("module1", target);
diff --git a/paddle/cinn/backends/codegen_c_x86_test.cc b/paddle/cinn/backends/codegen_c_x86_test.cc
index 9e1821f7b0200..75d9d978dd960 100644
--- a/paddle/cinn/backends/codegen_c_x86_test.cc
+++ b/paddle/cinn/backends/codegen_c_x86_test.cc
@@ -41,7 +41,7 @@ TEST(CodeGenCX86, basic) {
   const int bn = 32;
 
   Target target;
-  target.arch = Target::Arch ::X86;
+  target.arch = common::X86Arch{};
   target.bits = Target::Bit ::k32;
   target.os = Target::OS ::Linux;
 
diff --git a/paddle/cinn/backends/codegen_cuda_dev.cc b/paddle/cinn/backends/codegen_cuda_dev.cc
index 6b6597b2e208c..9c19c6faffb73 100644
--- a/paddle/cinn/backends/codegen_cuda_dev.cc
+++ b/paddle/cinn/backends/codegen_cuda_dev.cc
@@ -26,6 +26,7 @@
 #include "paddle/cinn/ir/op/ir_operators.h"
 #include "paddle/cinn/ir/utils/ir_verify.h"
 #include "paddle/cinn/optim/ir_simplify.h"
+#include "paddle/common/errors.h"
 
 namespace cinn {
 namespace backends {
@@ -509,5 +510,36 @@ void CodeGenCUDA_Dev::Visit(const ir::Store *op) {
   }
 }
 
+ir::Expr CalculateSharedMemory(const ir::Buffer &buffer) {
+  Expr buffer_size(1);
+  for (int i = 0; i < buffer->shape.size(); i++) {
+    buffer_size = buffer_size * buffer->shape[i];
+  }
+  int type_bytes = buffer->dtype.bytes();
+  return buffer_size * Expr(type_bytes);
+}
+
+ir::Expr CalculateSharedMemory(const ir::Expr &func_expr) {
+  auto func = func_expr.as_lowered_func();
+  PADDLE_ENFORCE_NOT_NULL(
+      func, ::common::errors::InvalidType("expr is not a lowered_func"));
+  auto alloc_temp_buffers = func->PrepareAllocTempBufferExprs();
+  ir::Expr shm_size{0};
+  for (const auto &alloc : alloc_temp_buffers) {
+    PADDLE_ENFORCE_NOT_NULL(
+        alloc.As<ir::Alloc>(),
+        ::common::errors::InvalidType("expr is not a Alloc node"));
+    PADDLE_ENFORCE_NOT_NULL(
+        alloc.As<ir::Alloc>()->destination.as_buffer(),
+        ::common::errors::InvalidType("expr is not a Buffer node"));
+
+    auto buffer = alloc.As<ir::Alloc>()->destination.as_buffer_ref();
+    if (buffer->memory_type == ir::MemoryType::GPUShared) {
+      shm_size = shm_size + CalculateSharedMemory(buffer);
+    }
+  }
+  return common::AutoSimplify(shm_size);
+}
+
 }  // namespace backends
 }  // namespace cinn
diff --git a/paddle/cinn/backends/codegen_cuda_dev.h b/paddle/cinn/backends/codegen_cuda_dev.h
index d1ebfd930f92f..d0995fccc0e06 100644
--- a/paddle/cinn/backends/codegen_cuda_dev.h
+++ b/paddle/cinn/backends/codegen_cuda_dev.h
@@ -127,5 +127,7 @@ class CodeGenCUDA_Dev : public CodeGenC {
   std::vector<ir::Expr> dynamic_alloc_buffers_;
 };
 
+ir::Expr CalculateSharedMemory(const ir::Expr& func_expr);
+
 }  // namespace backends
 }  // namespace cinn
diff --git a/paddle/cinn/backends/codegen_cuda_util.cc b/paddle/cinn/backends/codegen_cuda_util.cc
index 1c8d535507cb7..729dcca7be745 100644
--- a/paddle/cinn/backends/codegen_cuda_util.cc
+++ b/paddle/cinn/backends/codegen_cuda_util.cc
@@ -91,12 +91,7 @@ void detail::CollectBucketStrategyHostFunctionVisitor::ProcessLoweredFunc(
   ir::Var kernel_ptr(GenDeviceKernelName(func_node->name, predicate),
                      type_of<std::string>());
 
-  // shared_mem_bytes Can be calculated after codegen_cuda_dev buffer creation
-  // however, this make CodeGenCUDA_Dev before spliting the host and device
-  // module Maybe we could reorder the process.
-  CodeGenCUDA_Dev codegen_dev(cinn::common::DefaultNVGPUTarget());
-  codegen_dev.Compile(ir::LoweredFunc(func.as_lowered_func_ref()));
-  Expr shared_mem_bytes = codegen_dev.GetDynSharedMemOffset();
+  Expr shared_mem_bytes = CalculateSharedMemory(func);
 
   VLOG(6) << "Add a call node for func_node->name " << func_node->name << "\n"
           << "grid_dim: (" << func_node->cuda_axis_info.grid_dim(0) << ", "
diff --git a/paddle/cinn/backends/compiler.cc b/paddle/cinn/backends/compiler.cc
index f63869730a11f..b37090a74fbe1 100644
--- a/paddle/cinn/backends/compiler.cc
+++ b/paddle/cinn/backends/compiler.cc
@@ -30,6 +30,7 @@
 #include "paddle/cinn/runtime/cuda/cuda_util.h"
 #include "paddle/cinn/runtime/flags.h"
 #endif
+#include "paddle/cinn/adt/adt.h"
 
 PD_DECLARE_string(cinn_source_code_save_path);
 PD_DECLARE_string(cinn_dump_group_lowered_func);
@@ -229,41 +230,41 @@ void SourceCodePrint::write(const std::string& source_code) {
 }
 
 void Compiler::Build(const Module& module, const std::string& code) {
-  if (target_.arch == Target::Arch::NVGPU) {
-    CompileCudaModule(module, code);
-  } else if (target_.arch == Target::Arch::X86) {
-    CompileX86Module(module);
-  } else {
-    CINN_NOT_IMPLEMENTED
-  }
+  auto PatternMatch =
+      adt::match{[&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; },
+                 [&](common::X86Arch) { CompileX86Module(module); },
+                 [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; },
+                 [&](common::NVGPUArch) { CompileCudaModule(module, code); }};
+  return std::visit(PatternMatch, target_.arch.variant());
 }
 
 std::string Compiler::GetSourceCode(const ir::Module& module) {
-  if (target_.arch == Target::Arch::NVGPU) {
+  return target_.arch.Visit(adt::match{
+      [&](common::UnknownArch) -> std::string { CINN_NOT_IMPLEMENTED; },
+      [&](common::X86Arch) -> std::string { CINN_NOT_IMPLEMENTED; },
+      [&](common::ARMArch) -> std::string { CINN_NOT_IMPLEMENTED; },
+      [&](common::NVGPUArch) -> std::string {
 #ifdef CINN_WITH_CUDA
-    auto _host_module_device_module_ =
-        SplitCudaAndHostModule(module);  // NOLINT
-    auto& host_module = std::get<0>(_host_module_device_module_);
-    auto& device_module = std::get<1>(_host_module_device_module_);
-    CodeGenCUDA_Dev codegen(target_);
-    auto source_code = codegen.Compile(device_module);
-    return source_code;
+        auto _host_module_device_module_ =
+            SplitCudaAndHostModule(module);  // NOLINT
+        auto& host_module = std::get<0>(_host_module_device_module_);
+        auto& device_module = std::get<1>(_host_module_device_module_);
+        CodeGenCUDA_Dev codegen(target_);
+        auto source_code = codegen.Compile(device_module);
+        return source_code;
 #else
-    CINN_NOT_IMPLEMENTED
+        CINN_NOT_IMPLEMENTED
 #endif
-  } else {
-    CINN_NOT_IMPLEMENTED
-  }
+      }});
 }
 
 void Compiler::BuildDefault(const Module& module) {
-  if (target_.arch == Target::Arch::NVGPU) {
-    CompileCudaModule(module);
-  } else if (target_.arch == Target::Arch::X86) {
-    CompileX86Module(module);
-  } else {
-    CINN_NOT_IMPLEMENTED
-  }
+  target_.arch.Visit(adt::match{
+      [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; },
+      [&](common::X86Arch) { CompileX86Module(module); },
+      [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; },
+      [&](common::NVGPUArch) { CompileCudaModule(module); },
+  });
 }
 
 void Compiler::CompileCudaModule(const Module& module,
diff --git a/paddle/cinn/backends/extern_func_jit_register.h b/paddle/cinn/backends/extern_func_jit_register.h
index 383f8b3565a4e..4784187c8eddd 100644
--- a/paddle/cinn/backends/extern_func_jit_register.h
+++ b/paddle/cinn/backends/extern_func_jit_register.h
@@ -93,15 +93,12 @@ namespace cinn {
 namespace backends {
 
 static const char* TargetToBackendRepr(Target target) {
-  switch (target.arch) {
-    case Target::Arch::X86:
-      return backend_llvm_host;
-    case Target::Arch::NVGPU:
-      return backend_nvgpu;
-    default:
-      CINN_NOT_IMPLEMENTED
-  }
-  return nullptr;
+  return target.arch.Visit(adt::match{
+      [&](common::UnknownArch) -> const char* { CINN_NOT_IMPLEMENTED; },
+      [&](common::X86Arch) -> const char* { return backend_llvm_host; },
+      [&](common::ARMArch) -> const char* { CINN_NOT_IMPLEMENTED; },
+      [&](common::NVGPUArch) -> const char* { return backend_nvgpu; },
+  });
 }
 
 /**
diff --git a/paddle/cinn/backends/llvm/codegen_llvm.cc b/paddle/cinn/backends/llvm/codegen_llvm.cc
index e24b5220919cb..2f8a387045bf6 100644
--- a/paddle/cinn/backends/llvm/codegen_llvm.cc
+++ b/paddle/cinn/backends/llvm/codegen_llvm.cc
@@ -1366,32 +1366,40 @@ llvm::Value *CodeGenLLVM::CreateVecSlice(llvm::Value *vec,
       vec, undef, llvm::ConstantVector::get(indices));
 }
 
+int GetNaiveVecAlignmentImpl(common::UnknownArch, const Target &target) {
+  PADDLE_THROW(phi::errors::InvalidArgument("unknown Arch found"));
+}
+
+int GetNaiveVecAlignmentImpl(common::X86Arch, const Target &target) {
+  if (target.bits == Target::Bit::k32) {
+    return 256;
+  } else if (target.bits == Target::Bit::k64) {
+    return 512;
+  }
+  PADDLE_THROW(phi::errors::InvalidArgument("get unknown bits"));
+}
+
+int GetNaiveVecAlignmentImpl(common::ARMArch, const Target &target) {
+  return 128;
+}
+
+int GetNaiveVecAlignmentImpl(common::NVGPUArch, const Target &target) {
+  return 128;
+}
+
+int GetNaiveVecAlignment(const Target &target) {
+  return std::visit(
+      [&](const auto &impl) { return GetNaiveVecAlignmentImpl(impl, target); },
+      target.arch.variant());
+}
+
 void CodeGenLLVM::InitTarget(const Target &target) {
   llvm::InitializeAllTargetInfos();
   llvm::InitializeAllTargets();
   llvm::InitializeAllTargetMCs();
   llvm::InitializeAllAsmParsers();
   llvm::InitializeAllAsmPrinters();
-  switch (target.arch) {
-    case Target::Arch::X86:
-      if (target.bits == Target::Bit::k32) {
-        naive_vec_alignment_ = 256;
-      } else if (target.bits == Target::Bit::k64) {
-        naive_vec_alignment_ = 512;
-      } else {
-        PADDLE_THROW(phi::errors::InvalidArgument("get unknown bits"));
-      }
-      break;
-    case Target::Arch::ARM:
-      naive_vec_alignment_ = 128;
-      break;
-    case Target::Arch::NVGPU:
-      naive_vec_alignment_ = 128;
-      break;
-    case Target::Arch::Unk:
-      PADDLE_THROW(phi::errors::InvalidArgument("unknown Arch found"));
-      break;
-  }
+  naive_vec_alignment_ = GetNaiveVecAlignment(target);
 }
 
 bool LLVM_WillVarLowerAsPointer(const std::string &var_name) {
diff --git a/paddle/cinn/backends/llvm/execution_engine_test.cc b/paddle/cinn/backends/llvm/execution_engine_test.cc
index a66b63248a50d..a13f329a81259 100644
--- a/paddle/cinn/backends/llvm/execution_engine_test.cc
+++ b/paddle/cinn/backends/llvm/execution_engine_test.cc
@@ -108,7 +108,7 @@ auto CreateTestCinnModule() {
   C->Bind(C_buf);
 
   cinn::common::Target target;
-  target.arch = cinn::common::Target::Arch::X86;
+  target.arch = cinn::common::X86Arch{};
   target.bits = cinn::common::Target::Bit::k32;
   target.os = cinn::common::Target::OS::Linux;
   ir::Module::Builder builder("module1", target);
diff --git a/paddle/cinn/common/CMakeLists.txt b/paddle/cinn/common/CMakeLists.txt
index 95227b6f414a4..a8b72866dc1f5 100644
--- a/paddle/cinn/common/CMakeLists.txt
+++ b/paddle/cinn/common/CMakeLists.txt
@@ -7,6 +7,7 @@ gather_srcs(
   cinn_value.cc
   type.cc
   target.cc
+  arch_util.cc
   object.cc
   debug_manager.cc
   info_registry.cc
diff --git a/paddle/cinn/common/arch.h b/paddle/cinn/common/arch.h
new file mode 100644
index 0000000000000..e43dbeadc97ab
--- /dev/null
+++ b/paddle/cinn/common/arch.h
@@ -0,0 +1,71 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <functional>
+#include <ostream>
+#include <variant>
+
+namespace cinn {
+namespace common {
+
+struct UnknownArch {};
+
+struct X86Arch {};
+
+struct ARMArch {};
+
+struct NVGPUArch {};
+
+/**
+ * The architecture used by the target. Determines the instruction set to use.
+ */
+using ArchBase = std::variant<UnknownArch, X86Arch, ARMArch, NVGPUArch>;
+struct Arch final : public ArchBase {
+  using ArchBase::ArchBase;
+
+  template <typename VisitorT>
+  decltype(auto) Visit(VisitorT&& visitor) const {
+    return std::visit(visitor, variant());
+  }
+
+  const ArchBase& variant() const {
+    return static_cast<const ArchBase&>(*this);
+  }
+
+  bool operator==(const auto& other) const {
+    return this->index() == other.index();
+  }
+
+  bool operator!=(const auto& other) const { return !(*this == other); }
+};
+
+inline bool IsDefined(Arch arch) {
+  return !std::holds_alternative<UnknownArch>(arch);
+}
+
+}  // namespace common
+}  // namespace cinn
+
+namespace std {
+
+template <>
+struct hash<::cinn::common::Arch> {
+  std::size_t operator()(const ::cinn::common::Arch& arch) const {
+    return arch.index();
+  }
+};
+
+}  // namespace std
diff --git a/paddle/cinn/common/arch_util.cc b/paddle/cinn/common/arch_util.cc
new file mode 100644
index 0000000000000..4f67fff471b6e
--- /dev/null
+++ b/paddle/cinn/common/arch_util.cc
@@ -0,0 +1,41 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/cinn/common/arch_util.h"
+
+namespace cinn {
+namespace common {
+
+std::string GetArchNameImpl(UnknownArch arch) { return "Unk"; }
+
+std::string GetArchNameImpl(X86Arch arch) { return "X86"; }
+
+std::string GetArchNameImpl(ARMArch arch) { return "ARM"; }
+
+std::string GetArchNameImpl(NVGPUArch arch) { return "NVGPU"; }
+
+std::string GetArchName(Arch arch) {
+  return std::visit([](const auto& impl) { return GetArchNameImpl(impl); },
+                    arch.variant());
+}
+
+std::ostream& operator<<(std::ostream& os, Arch arch) {
+  os << GetArchName(arch);
+  return os;
+}
+
+}  // namespace common
+}  // namespace cinn
diff --git a/paddle/cinn/common/arch_util.h b/paddle/cinn/common/arch_util.h
new file mode 100644
index 0000000000000..6f2f2adc9700b
--- /dev/null
+++ b/paddle/cinn/common/arch_util.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <functional>
+#include <ostream>
+#include <sstream>
+#include <string>
+#include <vector>
+#include "paddle/cinn/common/arch.h"
+
+namespace cinn {
+namespace common {
+
+std::string GetArchName(Arch arch);
+std::ostream& operator<<(std::ostream& os, Arch arch);
+
+}  // namespace common
+}  // namespace cinn
diff --git a/paddle/cinn/common/dev_info_manager.h b/paddle/cinn/common/dev_info_manager.h
index 0f9989f7c67e4..c9a1a9040950c 100644
--- a/paddle/cinn/common/dev_info_manager.h
+++ b/paddle/cinn/common/dev_info_manager.h
@@ -24,7 +24,7 @@
 namespace cinn {
 namespace common {
 
-template <Target::Arch arch>
+template <typename ArchT>
 struct GetDevType {
   using DevType = DevInfoBase;
 };
@@ -32,11 +32,11 @@ struct GetDevType {
 // Extra device should be added here
 class NVGPUDevInfo;
 template <>
-struct GetDevType<Target::Arch::NVGPU> {
+struct GetDevType<NVGPUArch> {
   using DevType = NVGPUDevInfo;
 };
 
-template <Target::Arch arch>
+template <typename ArchT>
 class DevInfoMgr final {
  private:
   explicit DevInfoMgr(int device_num = 0) : device_num_(device_num) {
diff --git a/paddle/cinn/common/target.cc b/paddle/cinn/common/target.cc
index c24c89c29ae1a..57657d01d45a8 100644
--- a/paddle/cinn/common/target.cc
+++ b/paddle/cinn/common/target.cc
@@ -22,6 +22,7 @@
 #include <sstream>
 
 #include "paddle/cinn/backends/cuda_util.h"
+#include "paddle/cinn/common/arch_util.h"
 #include "paddle/cinn/common/target.h"
 #include "paddle/cinn/runtime/cinn_runtime.h"
 #include "paddle/common/enforce.h"
@@ -43,29 +44,57 @@ bool Target::operator==(const Target &other) const {
          features == other.features;
 }
 
-int Target::runtime_arch() const {
-  switch (arch) {
-    case Arch::Unk:
-      return cinn_unk_device;
-    case Arch::X86:
-      return cinn_x86_device;
-    case Arch::ARM:
-      return cinn_arm_device;
-    default:
-      PADDLE_THROW(phi::errors::InvalidArgument("Not supported arch"));
-  }
-  return -1;
+int GetRuntimeArchImpl(UnknownArch) { return cinn_unk_device; }
+
+int GetRuntimeArchImpl(X86Arch) { return cinn_x86_device; }
+
+int GetRuntimeArchImpl(ARMArch) { return cinn_arm_device; }
+
+int GetRuntimeArchImpl(NVGPUArch) {
+  PADDLE_THROW(phi::errors::InvalidArgument("Not supported arch"));
 }
 
-int Target::max_num_threads() const {
-  CHECK(arch == Arch::NVGPU)
-      << "The target is not NVGPU! Cannot get max number of threads.";
-  return 1024;
+int GetRuntimeArch(Arch arch) {
+  return std::visit([](const auto &impl) { return GetRuntimeArchImpl(impl); },
+                    arch.variant());
 }
 
-int Target::get_multi_processor_count() const {
-  CHECK(arch == Arch::NVGPU)
-      << "The target is not NVGPU! Cannot get multi processor count";
+int Target::runtime_arch() const { return GetRuntimeArch(arch); }
+
+int GetMaxNumThreadsImpl(UnknownArch arch) {
+  LOG(FATAL) << "The target is not GPU! Cannot get max number of threads.";
+}
+
+int GetMaxNumThreadsImpl(X86Arch arch) {
+  LOG(FATAL) << "The target is not GPU! Cannot get max number of threads.";
+}
+
+int GetMaxNumThreadsImpl(ARMArch arch) {
+  LOG(FATAL) << "The target is not GPU! Cannot get max number of threads.";
Cannot get max number of threads."; +} + +int GetMaxNumThreadsImpl(NVGPUArch arch) { return 1024; } + +int GetMaxNumThreads(Arch arch) { + return std::visit([](const auto &impl) { return GetMaxNumThreadsImpl(impl); }, + arch.variant()); +} + +int Target::max_num_threads() const { return GetMaxNumThreads(arch); } + +int GetMultiProcessCountImpl(UnknownArch arch) { + LOG(FATAL) << "The target is not GPU! Cannot get multi processor count."; +} + +int GetMultiProcessCountImpl(X86Arch arch) { + LOG(FATAL) << "The target is not GPU! Cannot get multi processor count."; +} + +int GetMultiProcessCountImpl(ARMArch arch) { + LOG(FATAL) << "The target is not GPU! Cannot get multi processor count."; +} + +int GetMultiProcessCountImpl(NVGPUArch arch) { int num_sm = 0; #ifdef CINN_WITH_CUDA cudaDeviceGetAttribute( @@ -74,9 +103,32 @@ int Target::get_multi_processor_count() const { return num_sm; } -int Target::get_max_threads_per_sm() const { - CHECK(arch == Arch::NVGPU) - << "The target is not NVGPU! Cannot get max threads per stream processor"; +int GetMultiProcessCount(Arch arch) { + return std::visit( + [](const auto &impl) { return GetMultiProcessCountImpl(impl); }, + arch.variant()); +} + +int Target::get_multi_processor_count() const { + return GetMultiProcessCount(arch); +} + +int GetMaxThreadsPerSmImpl(UnknownArch arch) { + LOG(FATAL) + << "The target is not GPU! Cannot get max threads per stream processor"; +} + +int GetMaxThreadsPerSmImpl(X86Arch arch) { + LOG(FATAL) + << "The target is not GPU! Cannot get max threads per stream processor"; +} + +int GetMaxThreadsPerSmImpl(ARMArch arch) { + LOG(FATAL) + << "The target is not GPU! Cannot get max threads per stream processor"; +} + +int GetMaxThreadsPerSmImpl(NVGPUArch arch) { int max_thread = 0; #ifdef CINN_WITH_CUDA cudaDeviceGetAttribute( @@ -85,9 +137,30 @@ int Target::get_max_threads_per_sm() const { return max_thread; } -int Target::get_max_blocks_per_sm() const { - CHECK(arch == Arch::NVGPU) - << "The target is not NVGPU! Cannot get max blocks per stream processor"; +int GetMaxThreadsPerSm(Arch arch) { + return std::visit( + [](const auto &impl) { return GetMaxThreadsPerSmImpl(impl); }, + arch.variant()); +} + +int Target::get_max_threads_per_sm() const { return GetMaxThreadsPerSm(arch); } + +int GetMaxBlocksPerSmImpl(UnknownArch) { + LOG(FATAL) + << "The target is not GPU! Cannot get max blocks per stream processor"; +} + +int GetMaxBlocksPerSmImpl(X86Arch) { + LOG(FATAL) + << "The target is not GPU! Cannot get max blocks per stream processor"; +} + +int GetMaxBlocksPerSmImpl(ARMArch) { + LOG(FATAL) + << "The target is not GPU! 
Cannot get max blocks per stream processor";
+}
+
+int GetMaxBlocksPerSmImpl(NVGPUArch) {
   int max_blocks = 1;
 #ifdef CINN_WITH_CUDA
   cudaDeviceGetAttribute(
@@ -96,6 +169,14 @@ int Target::get_max_blocks_per_sm() const {
   return max_blocks;
 }
 
+int GetMaxBlocksPerSm(Arch arch) {
+  return std::visit(
+      [](const auto &impl) { return GetMaxBlocksPerSmImpl(impl); },
+      arch.variant());
+}
+
+int Target::get_max_blocks_per_sm() const { return GetMaxBlocksPerSm(arch); }
+
 std::vector<std::string> Target::get_target_libs() const { return libs; }
 
 int Target::get_target_bits() const {
@@ -133,21 +214,7 @@ std::ostream &operator<<(std::ostream &os, const Target &target) {
   }
   os << ",";
-
-  switch (target.arch) {
-    case Target::Arch::X86:
-      os << "x86";
-      break;
-    case Target::Arch::ARM:
-      os << "arm";
-      break;
-    case Target::Arch::NVGPU:
-      os << "nvgpu";
-      break;
-    case Target::Arch::Unk:
-      os << "unk";
-      break;
-  }
+  os << target.arch;
   os << ",";
 
   switch (target.bits) {
@@ -166,38 +233,19 @@
   return os;
 }
 
-std::ostream &operator<<(std::ostream &os, Target::Arch arch) {
-  switch (arch) {
-    case Target::Arch::Unk:
-      os << "Unk";
-      break;
-    case Target::Arch::X86:
-      os << "X86";
-      break;
-    case Target::Arch::ARM:
-      os << "ARM";
-      break;
-    case Target::Arch::NVGPU:
-      os << "NVGPU";
-      break;
-  }
-  return os;
-}
-
 const Target &UnkTarget() {
   static Target target(
-      Target::OS::Unk, Target::Arch::Unk, Target::Bit::Unk, {}, {});
+      Target::OS::Unk, UnknownArch{}, Target::Bit::Unk, {}, {});
   return target;
 }
 
 const Target &DefaultHostTarget() {
-  static Target target(
-      Target::OS::Linux, Target::Arch::X86, Target::Bit::k64, {}, {});
+  static Target target(Target::OS::Linux, X86Arch{}, Target::Bit::k64, {}, {});
   return target;
 }
 
 const Target &DefaultNVGPUTarget() {
   static Target target(
-      Target::OS::Linux, Target::Arch::NVGPU, Target::Bit::k64, {}, {});
+      Target::OS::Linux, NVGPUArch{}, Target::Bit::k64, {}, {});
   return target;
 }
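One practical effect of the Get*Impl overload sets introduced in target.cc above: std::visit requires the visitor to handle every alternative, so extending the Arch variant forces each per-arch function to be updated at compile time. A sketch of that property (HygonDCUArch is a hypothetical new backend, not part of this diff):

#include <variant>

struct X86Arch {};
struct NVGPUArch {};
struct HygonDCUArch {};  // hypothetical new backend

using Arch = std::variant<X86Arch, NVGPUArch, HygonDCUArch>;

int GetMaxNumThreadsImpl(X86Arch) { return 0; }  // not a GPU
int GetMaxNumThreadsImpl(NVGPUArch) { return 1024; }
// Deleting the overload below makes GetMaxNumThreads fail to compile the
// moment HygonDCUArch joins the variant; the old enum-based switch would
// have compiled and misbehaved at runtime instead.
int GetMaxNumThreadsImpl(HygonDCUArch) { return 1024; }

int GetMaxNumThreads(Arch arch) {
  return std::visit(
      [](const auto& impl) { return GetMaxNumThreadsImpl(impl); }, arch);
}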
diff --git a/paddle/cinn/common/target.h b/paddle/cinn/common/target.h
index 9fdc1d9939360..6df1d1ece8c5f 100644
--- a/paddle/cinn/common/target.h
+++ b/paddle/cinn/common/target.h
@@ -17,7 +17,10 @@
 #include
 #include
 #include
+#include
 #include
+#include "paddle/cinn/adt/adt.h"
+#include "paddle/cinn/common/arch.h"
 
 namespace cinn {
 namespace common {
@@ -33,16 +36,6 @@ struct Target {
     Windows,
   };
 
-  /**
-   * The architecture used by the target. Determines the instruction set to use.
-   */
-  enum class Arch : int {
-    Unk = -1,
-    X86,
-    ARM,
-    NVGPU,
-  };
-
   enum class Bit : int {
     Unk = -1,
     k32,
@@ -50,7 +43,7 @@
   };
 
   OS os{OS::Unk};
-  Arch arch{Arch::Unk};
+  Arch arch{UnknownArch{}};
   Bit bits{Bit::Unk};
 
   enum class Feature : int {
@@ -69,13 +62,13 @@
   std::vector<std::string> libs;
 
   explicit Target(OS o = OS::Linux,
-                  Arch a = Arch::Unk,
+                  Arch a = UnknownArch{},
                   Bit b = Bit::Unk,
                   const std::vector<Feature>& features = {},
                   const std::vector<std::string>& libs = {});
 
   bool defined() const {
-    return os != OS::Unk && arch != Arch::Unk && bits != Bit::Unk;
+    return os != OS::Unk && IsDefined(arch) && bits != Bit::Unk;
   }
 
   //! Get the Runtime architecture, it is casted to integer to avoid header file
@@ -113,7 +106,5 @@
 int GetMaxThreads();
 
 int GetMaxBlocks();
 
-std::ostream& operator<<(std::ostream& os, Target::Arch arch);
-
 }  // namespace common
 }  // namespace cinn
diff --git a/paddle/cinn/frontend/CMakeLists.txt b/paddle/cinn/frontend/CMakeLists.txt
index 2ba6ccd12e5bf..1d5315e1f965a 100755
--- a/paddle/cinn/frontend/CMakeLists.txt
+++ b/paddle/cinn/frontend/CMakeLists.txt
@@ -62,7 +62,7 @@ add_subdirectory(paddle)
 add_subdirectory(decomposer)
 add_subdirectory(op_mappers)
 add_subdirectory(pass)
-# add_subdirectory(group_cluster)
+#add_subdirectory(group_cluster)
 
 cinn_cc_test(test_op_mapper_registry SRCS op_mapper_registry_test.cc DEPS
              cinncore)
diff --git a/paddle/cinn/frontend/computation.cc b/paddle/cinn/frontend/computation.cc
index ee7d2ce6b3a82..387fd87f9c709 100644
--- a/paddle/cinn/frontend/computation.cc
+++ b/paddle/cinn/frontend/computation.cc
@@ -58,12 +58,16 @@ std::shared_ptr<ComputationContext> CompileProgram(
   if (ctx->compile_options.use_default_passes) {
     hlir::framework::ApplyPass(ctx->graph.get(), "InferShape");
-
+    target.arch.Visit(adt::match{
+        [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; },
+        [&](common::X86Arch) {
 #ifndef CINN_WITH_CUDA
-    if (target.arch == Target::Arch::X86) {
-      hlir::framework::ApplyPass(ctx->graph.get(), "AlterLayout");
-    }
+          hlir::framework::ApplyPass(ctx->graph.get(), "AlterLayout");
 #endif
+        },
+        [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; },
+        [&](common::NVGPUArch) { CINN_NOT_IMPLEMENTED; },
+    });
     hlir::framework::ApplyPass(ctx->graph.get(), "ConstPropagate");
     hlir::framework::ApplyPasses(ctx->graph.get(), DefaultOpFusionPasses());
   }
@@ -200,34 +204,37 @@ void CinnComputation::SetTensorData(hlir::framework::Tensor &t,
                                     size_t size) {
   void *tdata = t->mutable_data(context_->target, t->type());
   CHECK_EQ(size, t->shape().numel() * t->type().bytes());
-  if (context_->target.arch == Target::Arch::NVGPU) {
+  context_->target.arch.Visit(adt::match{
+      [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; },
+      [&](common::X86Arch) { memcpy(tdata, data, size); },
+      [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; },
+      [&](common::NVGPUArch) {
 #ifdef CINN_WITH_CUDA
-    CUDA_CALL(cudaMemcpy(tdata, data, size, cudaMemcpyHostToDevice));
+        CUDA_CALL(cudaMemcpy(tdata, data, size, cudaMemcpyHostToDevice));
 #else
-    CINN_NOT_IMPLEMENTED
+        CINN_NOT_IMPLEMENTED;
 #endif
-  } else if (context_->target.arch == Target::Arch::X86) {
-    memcpy(tdata, data, size);
-  } else {
-    CINN_NOT_IMPLEMENTED
-  }
+      },
+  });
 }
+
 void CinnComputation::GetTensorData(hlir::framework::Tensor &t,
                                     void *data,
                                     size_t size) {
   void *tdata = t->mutable_data(context_->target, t->type());
   CHECK_EQ(size, t->shape().numel() * t->type().bytes());
-  if (context_->target.arch == Target::Arch::NVGPU) {
+  context_->target.arch.Visit(adt::match{
+      [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; },
+      [&](common::X86Arch) { memcpy(data, tdata, size); },
+      [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; },
+      [&](common::NVGPUArch) {
 #ifdef CINN_WITH_CUDA
-    CUDA_CALL(cudaMemcpy(data, tdata, size, cudaMemcpyDeviceToHost));
+        CUDA_CALL(cudaMemcpy(data, tdata, size, cudaMemcpyDeviceToHost));
 #else
-    CINN_NOT_IMPLEMENTED
+        CINN_NOT_IMPLEMENTED;
 #endif
-  } else if (context_->target.arch == Target::Arch::X86) {
-    memcpy(data, tdata, size);
-  } else {
-    CINN_NOT_IMPLEMENTED
-  }
+      },
+  });
 }
 
 void CinnComputation::GetTensorData(const std::string &tname,
diff --git a/paddle/cinn/frontend/group_cluster/CMakeLists.txt b/paddle/cinn/frontend/group_cluster/CMakeLists.txt
deleted file mode 100644 index 14cb3c1cfa0e8..0000000000000 --- a/paddle/cinn/frontend/group_cluster/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -gather_srcs(group_cluster_src SRCS common_utils.cc pattern_node.cc - pattern_graph.cc) - -add_subdirectory(cluster_policy) - -cc_library(group_cluster SRCS ${group_cluster_src}) diff --git a/paddle/cinn/frontend/group_cluster/cluster_policy/CMakeLists.txt b/paddle/cinn/frontend/group_cluster/cluster_policy/CMakeLists.txt deleted file mode 100644 index c5328419c7f7b..0000000000000 --- a/paddle/cinn/frontend/group_cluster/cluster_policy/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -gather_srcs(group_cluster_src SRCS general_topo_policy.cc policy_manager.cc) - -add_subdirectory(shardable_axes_policy) diff --git a/paddle/cinn/frontend/group_cluster/cluster_policy/general_topo_policy.cc b/paddle/cinn/frontend/group_cluster/cluster_policy/general_topo_policy.cc deleted file mode 100644 index 87f8523eda49f..0000000000000 --- a/paddle/cinn/frontend/group_cluster/cluster_policy/general_topo_policy.cc +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/cinn/frontend/group_cluster/cluster_policy/general_topo_policy.h" - -namespace cinn::frontend::group_cluster::policy { - -bool GeneralTopoPolicy::CanFuse(const PatternNodePtr upstream, - const PatternNodePtr downstream) { - // TODO(wuzhanfei) topo policy (if lead to loop) - return false; -} - -} // namespace cinn::frontend::group_cluster::policy diff --git a/paddle/cinn/frontend/group_cluster/cluster_policy/policy_manager.cc b/paddle/cinn/frontend/group_cluster/cluster_policy/policy_manager.cc deleted file mode 100644 index 3f54bacbd3ecd..0000000000000 --- a/paddle/cinn/frontend/group_cluster/cluster_policy/policy_manager.cc +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/cinn/frontend/group_cluster/cluster_policy/policy_manager.h" -#include "paddle/common/enforce.h" - -namespace cinn::frontend::group_cluster::policy { - -bool PolicyManager::CanFuse(const PatternNodePtr upstream, - const PatternNodePtr downstream) { - for (const auto& policy : policies_) { - if (!policy->CanFuse(upstream, downstream)) return false; - } - return true; -} - -} // namespace cinn::frontend::group_cluster::policy diff --git a/paddle/cinn/frontend/group_cluster/cluster_policy/policy_manager.h b/paddle/cinn/frontend/group_cluster/cluster_policy/policy_manager.h deleted file mode 100644 index f7a2f100add82..0000000000000 --- a/paddle/cinn/frontend/group_cluster/cluster_policy/policy_manager.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/cinn/frontend/group_cluster/pattern_node.h" - -namespace cinn::frontend::group_cluster::policy { - -class Policy { - public: - virtual bool CanFuse(const PatternNodePtr upstream, - const PatternNodePtr downstream) = 0; -}; - -using PolicyPtr = std::shared_ptr; - -class PolicyManager { - public: - explicit PolicyManager(const std::vector& policies) - : policies_(policies) {} - bool CanFuse(const PatternNodePtr upstream, const PatternNodePtr downstream); - - private: - std::vector policies_; -}; - -} // namespace cinn::frontend::group_cluster::policy diff --git a/paddle/cinn/frontend/group_cluster/cluster_policy/shardable_axes_policy/CMakeLists.txt b/paddle/cinn/frontend/group_cluster/cluster_policy/shardable_axes_policy/CMakeLists.txt deleted file mode 100644 index 8d3f64fa5bc96..0000000000000 --- a/paddle/cinn/frontend/group_cluster/cluster_policy/shardable_axes_policy/CMakeLists.txt +++ /dev/null @@ -1,2 +0,0 @@ -gather_srcs(group_cluster_src SRCS shardable_axes_base.cc - shardable_axes_policy.cc) diff --git a/paddle/cinn/frontend/group_cluster/cluster_policy/shardable_axes_policy/shardable_axes_base.cc b/paddle/cinn/frontend/group_cluster/cluster_policy/shardable_axes_policy/shardable_axes_base.cc deleted file mode 100644 index ef58985330b70..0000000000000 --- a/paddle/cinn/frontend/group_cluster/cluster_policy/shardable_axes_policy/shardable_axes_base.cc +++ /dev/null @@ -1,165 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include "paddle/cinn/frontend/group_cluster/cluster_policy/shardable_axes_policy/shardable_axes_base.h" -#include "paddle/cinn/frontend/group_cluster/common_utils.h" - -namespace cinn::frontend::group_cluster::policy { - -std::string ShardableAxesInfoManager::GetUniqueName() { - static std::atomic counter = 0; - return "D" + std::to_string(counter); -} - -std::vector CreateNewNamesWithRank(int64_t rank) { - auto result = std::vector(); - for (int64_t i = 0; i < rank; i++) { - result.emplace_back(ShardableAxesInfoManager::GetUniqueName()); - } - return result; -} - -ShardableAxesSignature CreateDefaultSignature(const pir::Operation* op) { - ShardableAxesSignature result = ShardableAxesSignature(); - for (int i = 0; i < op->num_operands(); ++i) { - result.inputs.emplace_back( - CreateNewNamesWithRank(GetRank(op->operand_source(i)))); - } - for (int i = 0; i < op->num_results(); ++i) { - result.outputs.emplace_back(CreateNewNamesWithRank(GetRank(op->result(i)))); - } - return result; -} - -std::optional CreateSignatureForSpecialOps( - const pir::Operation* op) { - if (op->isa()) { - return CreateDefaultSignature(op); - } - return std::nullopt; -} - -ShardableAxesSignature CreateSignatureForReduce( - const pir::Operation* reduce_op) { - CHECK_EQ(reduce_op->num_operands(), 1); - CHECK_EQ(reduce_op->num_results(), 1); - ShardableAxesSignature result = ShardableAxesSignature(); - const size_t input_rank = GetRank(reduce_op->operand_source(0)); - auto input_axes = CreateNewNamesWithRank(input_rank); - - const auto& reduce_axis_idx = GetReduceAxisIdx(reduce_op); - bool keep_dim = GetReduceOpKeepDims(reduce_op); - auto output_axes = std::vector(); - - for (int i = 0; i < input_rank; i++) { - if (std::find(reduce_axis_idx.begin(), reduce_axis_idx.end(), i) != - reduce_axis_idx.end()) { - if (keep_dim) { - output_axes.emplace_back("constant_1"); - } // else do nothing - } else { - output_axes.emplace_back(input_axes[i]); - } - } - - result.inputs.emplace_back(input_axes); - result.outputs.emplace_back(output_axes); - - return result; -} - -ShardableAxesSignature CreateSignatureForElementWise(const pir::Operation* op) { - ShardableAxesSignature result = ShardableAxesSignature(); - - int64_t rank = GetRank(op->result(0)); - auto same_axes = CreateNewNamesWithRank(rank); - - for (int i = 0; i < op->num_operands(); ++i) { - CHECK(rank == GetRank(op->operand_source(i))); - result.inputs.emplace_back(same_axes); - } - for (int i = 0; i < op->num_results(); ++i) { - CHECK(rank == GetRank(op->result(i))); - result.outputs.emplace_back(same_axes); - } - return result; -} - -ShardableAxesSignature CreateSignatureForBroadcast(const pir::Operation* op) { - const auto& broad_cast_value = GetBroadcastOpInputOuputValue(op); - if (!broad_cast_value.has_value()) { - return CreateDefaultSignature(op); - } - const auto& [input, output] = broad_cast_value.value(); - // TODO(wuzhanfei) support broadcast - return CreateDefaultSignature(op); -} - -ShardableAxesSignature CreateShardableSignature(const pir::Operation* op) { - auto special_result = CreateSignatureForSpecialOps(op); - if (special_result != std::nullopt) { - return special_result.value(); - } - - CHECK(op->num_results() == 1) - << "Now we do not support op with multi outputs"; - ShardableAxesSignature result; - const hlir::framework::OpPatternKind kind = GetOpPatternKind(op); - if (kind == hlir::framework::kReduction) { - result = CreateSignatureForReduce(op); - } else if (kind == hlir::framework::kElementWise) { - result = 
CreateSignatureForElementWise(op); - } else if (kind == hlir::framework::kBroadcast) { - result = CreateSignatureForBroadcast(op); - } else { - result = CreateDefaultSignature(op); - } - VLOG(4) << "[ShardableAxesInfoManager] Create Shardable Axes Signature : \n" - << op->name() << " : " << result.DebugStr(); - return result; -} - -ShardableAxesInfoManager::ShardableAxesInfoManager( - const std::vector& ops, - const pir::ShapeConstraintIRAnalysis* shape_analysis) - : ops_(ops), shape_analysis_(shape_analysis) { - for (const auto& op : ops) { - op_signature_map_[op] = CreateShardableSignature(op); - } - - // TODO(wuzhanfei) update value_axes_map_ name_union_ -} - -std::string ShardableAxes::DebugStr() { - std::stringstream ss; - for (const auto& name : axis_names) { - ss << name << ", "; - } - return ss.str(); -} - -std::string ShardableAxesSignature::DebugStr() { - std::stringstream ss; - ss << "ShardableAxes Signature:\n"; - for (int i = 0; i < inputs.size(); i++) { - ss << "input " << i << ": " << inputs[i].DebugStr() << "\n"; - } - for (int i = 0; i < outputs.size(); i++) { - ss << "output " << i << ": " << outputs[i].DebugStr() << "\n"; - } - return ss.str(); -} - -} // namespace cinn::frontend::group_cluster::policy diff --git a/paddle/cinn/frontend/group_cluster/cluster_policy/shardable_axes_policy/shardable_axes_policy.h b/paddle/cinn/frontend/group_cluster/cluster_policy/shardable_axes_policy/shardable_axes_policy.h deleted file mode 100644 index 43b0634fcb2b6..0000000000000 --- a/paddle/cinn/frontend/group_cluster/cluster_policy/shardable_axes_policy/shardable_axes_policy.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/cinn/frontend/group_cluster/cluster_policy/policy_manager.h" -#include "paddle/cinn/frontend/group_cluster/cluster_policy/shardable_axes_policy/shardable_axes_base.h" - -namespace cinn::frontend::group_cluster::policy { - -class ShardableAxesPolicy final : virtual public Policy { - public: - ShardableAxesPolicy(const std::vector& ops, - const pir::ShapeConstraintIRAnalysis* shape_analysis) - : axes_info_(ops, shape_analysis) {} - bool CanFuse(const PatternNodePtr upstream, const PatternNodePtr downstream); - - private: - ShardableAxesInfoManager axes_info_; -}; - -} // namespace cinn::frontend::group_cluster::policy diff --git a/paddle/cinn/frontend/group_cluster/common_utils.cc b/paddle/cinn/frontend/group_cluster/common_utils.cc deleted file mode 100644 index 304b05193983e..0000000000000 --- a/paddle/cinn/frontend/group_cluster/common_utils.cc +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/cinn/frontend/group_cluster/common_utils.h" - -namespace cinn::frontend::group_cluster { - -OpPatternKind GetOpPatternKind(const ::pir::Operation* op) { - return hlir::framework::pir::CompatibleInfo::OpKind(*op); -} - -size_t GetRank(pir::Value value) { - return value.type().dyn_cast().dims().size(); -} - -std::vector GetReduceAxisIdx(const pir::Operation* reduce_op) { - const size_t input_rank = GetRank(reduce_op->operand_source(0)); - const auto& attr_val = reduce_op->attributes().at("dim"); - CHECK(attr_val.isa<::pir::ArrayAttribute>()); - const auto& axis_attr = attr_val.dyn_cast<::pir::ArrayAttribute>(); - std::vector reduce_axis_idx; - for (int i = 0; i < axis_attr.size(); ++i) { - int64_t axis = axis_attr.at(i).dyn_cast<::pir::Int64Attribute>().data(); - if (axis < 0) { - axis += input_rank; - } - CHECK_GE(axis, 0); - CHECK_LT(axis, input_rank); - reduce_axis_idx.push_back(axis); - } - return reduce_axis_idx; -} - -bool GetReduceOpKeepDims(const pir::Operation* reduce_op) { - const auto& attr_val = reduce_op->attributes().at("keep_dim"); - CHECK(attr_val.isa<::pir::BoolAttribute>()); - return attr_val.dyn_cast<::pir::BoolAttribute>(); -} - -std::string OpsDebugStr(std::vector ops) { - std::stringstream ss; - pir::IrPrinter printer(ss); - for (const auto* op : ops) { - printer.PrintOperation(const_cast(op)); - ss << "\n"; - } - return ss.str(); -} - -std::optional> GetBroadcastOpInputOuputValue( - const pir::Operation* op) { - auto* mut_op = const_cast(op); - if (op->isa()) { - auto expand_op = mut_op->dyn_cast(); - return std::make_pair(expand_op.x(), expand_op.out()); - } - if (op->isa()) { - auto broadcast_op = mut_op->dyn_cast(); - return std::make_pair(broadcast_op.x(), broadcast_op.out()); - } - VLOG(4) << "[ShardableAxesSignature] Unsupported Broadcast op: " - << op->name(); - return std::nullopt; -} -} // namespace cinn::frontend::group_cluster - -namespace cinn::frontend::group_cluster { - -bool IsTrivialPattern(const StmtPattern& pattern) { - return std::holds_alternative(pattern); -} - -bool IsReducePattern(const StmtPattern& pattern) { - return std::holds_alternative(pattern); -} - -bool IsUnsupportPattern(const StmtPattern& pattern) { - return std::holds_alternative(pattern); -} - -std::vector GetOpsInPattern(const StmtPattern& pattern) { - return std::visit([](const auto& impl) { return impl.ops_; }, pattern); -} - -std::string StmtPatternDebugStr(const StmtPattern& stmt) { - std::stringstream ss; - auto all_ops = GetOpsInPattern(stmt); - ss << "StmtPattern, size " << all_ops.size() << " :\n"; - ss << OpsDebugStr(all_ops); - return ss.str(); -} - -StmtPattern MergePattern(const StmtPattern& first, const StmtPattern& second) { - std::vector ops = - MergeVector(GetOpsInPattern(first), GetOpsInPattern(second)); - if (IsUnsupportPattern(first) || IsUnsupportPattern(second)) { - return UnsupportPattern(ops); - } else if (IsReducePattern(first) || IsReducePattern(second)) { - return ReducePattern(ops); - } else { - return TrivialPattern(ops); - } -} - -StmtPattern ConvertToStmtPattern(const pir::Operation* op) { - const auto& kind = 
GetOpPatternKind(op); - if (kind == hlir::framework::kReduction) { - return ReducePattern({op}); - } else if (kind == hlir::framework::kElementWise || - kind == hlir::framework::kBroadcast || - kind == hlir::framework::kInjective) { - return TrivialPattern({op}); - } else { - return UnsupportPattern({op}); - } -} - -} // namespace cinn::frontend::group_cluster diff --git a/paddle/cinn/frontend/group_cluster/common_utils.h b/paddle/cinn/frontend/group_cluster/common_utils.h deleted file mode 100644 index af2b6c5cde97d..0000000000000 --- a/paddle/cinn/frontend/group_cluster/common_utils.h +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "glog/logging.h" - -#include "paddle/cinn/frontend/group_cluster/pattern.h" - -#include "paddle/cinn/common/bfs_walker.h" -#include "paddle/cinn/common/topo_walker.h" - -#include "paddle/cinn/hlir/dialect/operator/ir/cinn_op.h" -#include "paddle/cinn/hlir/dialect/operator/ir/manual_op.h" -#include "paddle/cinn/hlir/framework/op.h" -#include "paddle/cinn/utils/string.h" -#include "paddle/pir/include/dialect/control_flow/ir/cf_op.h" - -namespace cinn::frontend::group_cluster { - -using OpPatternKind = cinn::hlir::framework::OpPatternKind; - -OpPatternKind GetOpPatternKind(const ::pir::Operation* op); -size_t GetRank(pir::Value value); -std::vector GetReduceAxisIdx(const pir::Operation* reduce_op); -bool GetReduceOpKeepDims(const pir::Operation* reduce_op); -std::string OpsDebugStr(std::vector ops); -std::optional> GetBroadcastOpInputOuputValue( - const pir::Operation* op); -} // namespace cinn::frontend::group_cluster - -namespace cinn::frontend::group_cluster { - -bool IsTrivialPattern(const StmtPattern& pattern); -bool IsReducePattern(const StmtPattern& pattern); -bool IsUnsupportPattern(const StmtPattern& pattern); - -template -void ExtendVector(std::vector* first, const std::vector& second) { - std::unordered_set visited = - std::unordered_set(first->begin(), first->end()); - for (auto iter = second.begin(); iter != second.end(); iter++) { - if (visited.find(*iter) == visited.end()) { - visited.emplace(*iter); - first->emplace_back(*iter); - } - } -} - -template -std::vector MergeVector(const std::vector& first, - const std::vector& second) { - std::vector result = std::vector(first); - ExtendVector(&result, second); - return result; -} - -std::vector GetOpsInPattern(const StmtPattern& pattern); -std::string StmtPatternDebugStr(const StmtPattern& pattern); -StmtPattern MergePattern(const StmtPattern& first, const StmtPattern& second); - -StmtPattern ConvertToStmtPattern(const pir::Operation* op); -} // namespace cinn::frontend::group_cluster diff --git a/paddle/cinn/frontend/group_cluster/group_cluster.h b/paddle/cinn/frontend/group_cluster/group_cluster.h deleted file mode 100644 index 950c3b77942a6..0000000000000 --- 
a/paddle/cinn/frontend/group_cluster/group_cluster.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/cinn/frontend/group_cluster/cluster_policy/general_topo_policy.h" -#include "paddle/cinn/frontend/group_cluster/cluster_policy/shardable_axes_policy/shardable_axes_policy.h" -#include "paddle/cinn/frontend/group_cluster/pattern_graph.h" - -namespace cinn::frontend { - -inline std::vector> ClusterOps( - const cinn::dialect::GroupOp& group_op) { - const auto& ops = [&] { - std::vector ops; - for (const auto& op : group_op.GetOperators()) { - ops.emplace_back(op); - } - return ops; - }(); - - VLOG(4) << "Start Cluster Ops!"; - VLOG(4) << "Input Group with size " << ops.size() << " :\n" - << group_cluster::OpsDebugStr(ops); - - const auto* shape_analysis = - &pir::ShapeAnalysisManager::Instance().Get(group_op->GetParentProgram()); - - auto shardable_axes_policy = - std::make_shared( - ops, shape_analysis); - auto general_topo_policy = - std::make_shared(); - - auto policy_manager = group_cluster::policy::PolicyManager( - {shardable_axes_policy, general_topo_policy}); - - group_cluster::PatternGraph graph(ops, policy_manager); - return graph.ClusterOps(); -} - -} // namespace cinn::frontend diff --git a/paddle/cinn/frontend/group_cluster/pattern.h b/paddle/cinn/frontend/group_cluster/pattern.h deleted file mode 100644 index c4d7928c28ba2..0000000000000 --- a/paddle/cinn/frontend/group_cluster/pattern.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include "paddle/pir/include/core/operation.h" - -namespace cinn::frontend::group_cluster { - -struct TrivialPattern { - explicit TrivialPattern(const std::vector& ops) - : ops_(ops) {} - std::vector ops_; -}; - -struct ReducePattern { - explicit ReducePattern(const std::vector& ops) - : ops_(ops) {} - std::vector ops_; -}; - -struct UnsupportPattern { - explicit UnsupportPattern(const std::vector& ops) - : ops_(ops) {} - std::vector ops_; -}; - -// UnsupportedPattern can't fuse with any pattern -// Step 1: T x T|R => T|R TrivialPattern can always fuse with -// downstream Step 2: R x T|R => R Use Shardable Axes Policy -// to judge - -// If we want add MatmulPattern => -// StmtPattern = std::variant; Fusion with different Pattern will have specialized logic -// to Judge, Update policy logic for MatmulPattern -using StmtPattern = - std::variant; - -} // namespace cinn::frontend::group_cluster diff --git a/paddle/cinn/frontend/group_cluster/pattern_graph.cc b/paddle/cinn/frontend/group_cluster/pattern_graph.cc deleted file mode 100644 index 57d2fd1388f77..0000000000000 --- a/paddle/cinn/frontend/group_cluster/pattern_graph.cc +++ /dev/null @@ -1,134 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/cinn/frontend/group_cluster/pattern_graph.h" - -namespace cinn::frontend::group_cluster { - -std::vector> PatternGraph::ClusterOps() { - SinkTrivialPattern(); - FuseReducePattern(); - // TODO(wuzhanfei) need sort here, or do not return from all_pattern_nodes_ - std::vector> result; - std::transform(all_pattern_nodes_.begin(), - all_pattern_nodes_.end(), - std::back_inserter(result), - [](const PatternNodePtr node) { return node->GetOps(); }); - return result; -} - -void PatternGraph::SinkTrivialPattern() { - // TODO(wuzhanfei): need consider Unsupport op here - const auto FindTrivialNode = - [](std::unordered_set all_nodes) -> PatternNodePtr { - for (PatternNodePtr node : all_nodes) { - if (node->IsTrivial() && !node->downstream_.empty()) return node; - } - return nullptr; - }; - - PatternNodePtr upstream; - while ((upstream = FindTrivialNode(all_pattern_nodes_)) != nullptr) { - std::vector fusion_candidate = upstream->downstream_; - upstream->downstream_.clear(); - for (const auto& downstream : fusion_candidate) { - PatternNodePtr new_node = - std::make_shared(upstream, downstream); - AppendNode(new_node); - RemoveNode(downstream); - } - RemoveNode(upstream); - } -} - -void PatternGraph::FuseReducePattern() { - // TODO(wuzhanfei) reduce fusion, similar with implementation in backend -} - -PatternGraph::PatternGraph(const std::vector& ops, - const policy::PolicyManager policy_manager) - : policy_manager_(policy_manager) { - std::unordered_map op_to_node_map; - - for (int i = 0; i < ops.size(); ++i) { - PatternNodePtr node = std::make_shared(ops[i]); - op_to_node_map[ops[i]] = node; - all_pattern_nodes_.emplace(node); - node->sink_op_ = ops[i]; - } - - for (const pir::Operation* op : ops) { - PatternNodePtr cur_node = op_to_node_map[op]; - - // add upstream nodes - for (int i = 0; i < op->num_operands(); ++i) { - ::pir::Operation* input_op = op->operand_source(i).defining_op(); - if (op_to_node_map.find(input_op) != op_to_node_map.end()) { - PatternNodePtr upstream_node = op_to_node_map[input_op]; - cur_node->upstream_.push_back(upstream_node); - upstream_node->downstream_.push_back(cur_node); - } - } - - // add downstream nodes - for (int i = 0; i < op->num_results(); ++i) { - pir::Value related_value = op->result(i); - for (auto consumer_it = related_value.use_begin(); - consumer_it != related_value.use_end(); - ++consumer_it) { - ::pir::Operation* output_op = consumer_it->owner(); - if (op_to_node_map.find(output_op) != op_to_node_map.end()) { - PatternNodePtr downstream_node = op_to_node_map[output_op]; - cur_node->downstream_.push_back(downstream_node); - downstream_node->upstream_.push_back(cur_node); - } - } - } - - if (cur_node->upstream_.empty()) { - entrance_nodes_.emplace(cur_node); - } - - if (cur_node->downstream_.empty()) { - exit_nodes_.emplace(cur_node); - } - } - - VLOG(4) << "PatternGraph Created, pattern node size: " - << all_pattern_nodes_.size(); -} - -void PatternGraph::RemoveNode(PatternNodePtr node) { - if (all_pattern_nodes_.find(node) != all_pattern_nodes_.end()) { - all_pattern_nodes_.erase(node); - } - if (entrance_nodes_.find(node) != entrance_nodes_.end()) { - entrance_nodes_.erase(node); - } - if (exit_nodes_.find(node) != exit_nodes_.end()) { - exit_nodes_.erase(node); - } -} - -void PatternGraph::AppendNode(PatternNodePtr node) { - all_pattern_nodes_.emplace(node); - if (node->upstream_.empty()) { - entrance_nodes_.emplace(node); - } - if (node->downstream_.empty()) { - exit_nodes_.emplace(node); - } -} - -} // namespace 
cinn::frontend::group_cluster diff --git a/paddle/cinn/frontend/group_cluster/pattern_graph.h b/paddle/cinn/frontend/group_cluster/pattern_graph.h deleted file mode 100644 index cc3c811eba519..0000000000000 --- a/paddle/cinn/frontend/group_cluster/pattern_graph.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once - -#include "paddle/cinn/frontend/group_cluster/cluster_policy/policy_manager.h" -#include "paddle/cinn/frontend/group_cluster/common_utils.h" -#include "paddle/cinn/frontend/group_cluster/pattern_node.h" - -namespace cinn::frontend::group_cluster { - -class PatternGraph { - public: - PatternGraph(const std::vector& ops, - const policy::PolicyManager policy_manager); - - std::vector> ClusterOps(); - - private: - void SinkTrivialPattern(); - void FuseReducePattern(); - - void RemoveNode(PatternNodePtr node); - void AppendNode(PatternNodePtr node); - - private: - std::unordered_set all_pattern_nodes_; - std::unordered_set entrance_nodes_; - std::unordered_set exit_nodes_; - - const policy::PolicyManager policy_manager_; -}; - -} // namespace cinn::frontend::group_cluster diff --git a/paddle/cinn/frontend/group_cluster/pattern_node.cc b/paddle/cinn/frontend/group_cluster/pattern_node.cc deleted file mode 100644 index 50c287e679bb4..0000000000000 --- a/paddle/cinn/frontend/group_cluster/pattern_node.cc +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/cinn/frontend/group_cluster/pattern_node.h" - -namespace cinn::frontend::group_cluster { - -PatternNode::PatternNode(const pir::Operation* op) - : sink_op_(op), stmt_pattern_(ConvertToStmtPattern(op)) {} - -PatternNode::PatternNode(PatternNodePtr fused_up_node, - PatternNodePtr fused_down_node) - : sink_op_(fused_down_node->sink_op_), - stmt_pattern_(MergePattern(fused_up_node->stmt_pattern_, - fused_down_node->stmt_pattern_)) { - const auto FindFromVector = - [](std::vector vec, - PatternNodePtr item) -> std::vector::iterator { - return std::find(vec.begin(), vec.end(), item); - }; - - ExtendVector(&upstream_, fused_up_node->upstream_); - ExtendVector(&upstream_, fused_down_node->upstream_); - - upstream_.erase(FindFromVector(upstream_, fused_up_node)); - - ExtendVector(&downstream_, fused_up_node->downstream_); - ExtendVector(&downstream_, fused_down_node->downstream_); - downstream_.erase(FindFromVector(downstream_, fused_down_node)); - - std::vector::iterator iter; - for (const auto& upstream_node : upstream_) { - iter = FindFromVector(upstream_node->downstream_, fused_up_node); - if (iter != upstream_node->downstream_.end()) { - upstream_node->downstream_.erase(iter); - } - iter = FindFromVector(upstream_node->downstream_, fused_down_node); - if (iter != upstream_node->downstream_.end()) { - upstream_node->downstream_.erase(iter); - } - } - - for (const auto& downstream_node : downstream_) { - iter = FindFromVector(downstream_node->upstream_, fused_up_node); - if (iter != downstream_node->upstream_.end()) { - downstream_node->upstream_.erase(iter); - } - iter = FindFromVector(downstream_node->upstream_, fused_down_node); - if (iter != downstream_node->upstream_.end()) { - downstream_node->upstream_.erase(iter); - } - } -} - -std::vector PatternNode::GetOps() const { - return GetOpsInPattern(stmt_pattern_); -} - -bool PatternNode::IsTrivial() const { return IsTrivialPattern(stmt_pattern_); } - -} // namespace cinn::frontend::group_cluster diff --git a/paddle/cinn/frontend/group_cluster/pattern_node.h b/paddle/cinn/frontend/group_cluster/pattern_node.h deleted file mode 100644 index 2eb957329904a..0000000000000 --- a/paddle/cinn/frontend/group_cluster/pattern_node.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "paddle/cinn/frontend/group_cluster/common_utils.h" - -namespace cinn::frontend::group_cluster { - -struct PatternNode { - using PatternNodePtr = std::shared_ptr; - - explicit PatternNode(const pir::Operation* op); - explicit PatternNode(PatternNodePtr fused_up_node, - PatternNodePtr fused_down_node); - - bool IsTrivial() const; - std::vector GetOps() const; - - StmtPattern stmt_pattern_; - const pir::Operation* sink_op_; - - std::vector upstream_; - std::vector downstream_; -}; - -using PatternNodePtr = PatternNode::PatternNodePtr; -} // namespace cinn::frontend::group_cluster diff --git a/paddle/cinn/frontend/op_mappers/paddle/conv2d.cc b/paddle/cinn/frontend/op_mappers/paddle/conv2d.cc index 21f1645752ffb..c44c77e6f0a1f 100644 --- a/paddle/cinn/frontend/op_mappers/paddle/conv2d.cc +++ b/paddle/cinn/frontend/op_mappers/paddle/conv2d.cc @@ -73,8 +73,15 @@ void Conv2dOpMapper(const paddle::cpp::OpDesc& op_desc, ctx.AddVarModelToProgram(out_name, out->id); } -void DepthwiseConv2dOpMapper(const paddle::cpp::OpDesc& op_desc, - const OpMapperContext& ctx) { +void DepthwiseConv2dOpMapperImpl(common::UnknownArch, + const paddle::cpp::OpDesc& op_desc, + const OpMapperContext& ctx) { + LOG(FATAL) << "NotImplemented."; +} + +void DepthwiseConv2dOpMapperImpl(common::X86Arch, + const paddle::cpp::OpDesc& op_desc, + const OpMapperContext& ctx) { CHECK_EQ(op_desc.Input("Input").size(), 1UL); auto x_name = op_desc.Input("Input").front(); CHECK_EQ(op_desc.Input("Filter").size(), 1UL); @@ -103,30 +110,83 @@ void DepthwiseConv2dOpMapper(const paddle::cpp::OpDesc& op_desc, auto y = ctx.GetVar(y_name); Variable out; - if (ctx.Target().arch == Target::Arch::X86) { - out = ctx.Builder()->Conv2d(x, - y, - strides, - paddings, - dilations, - groups, - data_format, - padding_algorithm); - } else { - out = ctx.Builder()->DepthwiseConv2d(x, - y, - strides, - paddings, - dilations, - groups, - data_format, - padding_algorithm); + out = ctx.Builder()->Conv2d(x, + y, + strides, + paddings, + dilations, + groups, + data_format, + padding_algorithm); + ctx.AddVar(out_name, out); + ctx.AddVarModelToProgram(out_name, out->id); +} + +void DepthwiseConv2dOpMapperImpl(common::ARMArch, + const paddle::cpp::OpDesc& op_desc, + const OpMapperContext& ctx) { + LOG(FATAL) << "NotImplemented."; +} + +void DepthwiseConv2dOpMapperImpl(common::NVGPUArch, + const paddle::cpp::OpDesc& op_desc, + const OpMapperContext& ctx) { + CHECK_EQ(op_desc.Input("Input").size(), 1UL); + auto x_name = op_desc.Input("Input").front(); + CHECK_EQ(op_desc.Input("Filter").size(), 1UL); + auto y_name = op_desc.Input("Filter").front(); + + CHECK_EQ(op_desc.Output("Output").size(), 1UL); + auto out_name = op_desc.Output("Output").front(); + + auto strides = + utils::GetAttrOrDefault>(op_desc, "strides", {1, 1}); + auto paddings = + utils::GetAttrOrDefault>(op_desc, "paddings", {0, 0}); + auto dilations = + utils::GetAttrOrDefault>(op_desc, "dilations", {1, 1}); + auto groups = utils::GetAttrOrDefault(op_desc, "groups", 1); + + auto data_format = + utils::GetAttrOrDefault(op_desc, "data_format", "NCHW"); + if (data_format == "AnyLayout") { + data_format = "NCHW"; } + auto padding_algorithm = utils::GetAttrOrDefault( + op_desc, "padding_algorithm", "EXPLICIT"); + auto x = ctx.GetVar(x_name); + auto y = ctx.GetVar(y_name); + + Variable out; + out = ctx.Builder()->DepthwiseConv2d(x, + y, + strides, + paddings, + dilations, + groups, + data_format, + padding_algorithm); + ctx.AddVar(out_name, out); ctx.AddVarModelToProgram(out_name, 
out->id); } +void DepthwiseConv2dOpMapperByArch(common::Arch arch, + const paddle::cpp::OpDesc& op_desc, + const OpMapperContext& ctx) { + return std::visit( + [&](const auto& impl) { + return DepthwiseConv2dOpMapperImpl(impl, op_desc, ctx); + }, + arch.variant()); +} + +void DepthwiseConv2dOpMapper(const paddle::cpp::OpDesc& op_desc, + const OpMapperContext& ctx) { + return DepthwiseConv2dOpMapperByArch(ctx.Target().arch, op_desc, ctx); +} + void Conv2dGradOpMapper(const paddle::cpp::OpDesc& op_desc, const OpMapperContext& ctx) { // get dy diff --git a/paddle/cinn/frontend/paddle/model_parser.cc b/paddle/cinn/frontend/paddle/model_parser.cc index 086cf11fe34b5..cc59f7a8bdb38 100644 --- a/paddle/cinn/frontend/paddle/model_parser.cc +++ b/paddle/cinn/frontend/paddle/model_parser.cc @@ -77,48 +77,50 @@ void TensorFromStream(std::istream &is, void *buf; size_t size = tensor->shape().numel() * SizeOfType(desc.data_type()); // allocate memory - if (target.arch == Target::Arch::X86) { - switch (static_cast(desc.data_type())) { + target.arch.Visit(adt::match{ + [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::X86Arch) { + switch (static_cast(desc.data_type())) { #define SET_TENSOR(desc, type, precision) \ case Type::VarType_Type_##desc: \ buf = tensor->mutable_data(target); \ tensor->set_type(precision); \ break - - SET_TENSOR(FP32, float, Float(32)); - SET_TENSOR(INT8, int8_t, Int(8)); - SET_TENSOR(INT16, int16_t, Int(16)); - SET_TENSOR(INT32, int32_t, Int(32)); - SET_TENSOR(INT64, int64_t, Int(64)); + SET_TENSOR(FP32, float, Float(32)); + SET_TENSOR(INT8, int8_t, Int(8)); + SET_TENSOR(INT16, int16_t, Int(16)); + SET_TENSOR(INT32, int32_t, Int(32)); + SET_TENSOR(INT64, int64_t, Int(64)); #undef SET_TENSOR - default: - std::stringstream ss; - ss << "unknown type " << desc.data_type(); - PADDLE_THROW(phi::errors::InvalidArgument(ss.str())); - } - // tensor->set_persistable(true); - is.read(static_cast(buf), size); - } else if (target.arch == Target::Arch::NVGPU) { + default: + std::stringstream ss; + ss << "unknown type " << desc.data_type(); + PADDLE_THROW(phi::errors::InvalidArgument(ss.str())); + } + // tensor->set_persistable(true); + is.read(static_cast(buf), size); + }, + [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::NVGPUArch) { #ifdef CINN_WITH_CUDA - if (desc.data_type() != Type::VarType_Type_FP32) - PADDLE_THROW( - phi::errors::InvalidArgument("[CUDA] The type is not fp32!!")); - auto *data = tensor->mutable_data(target); - tensor->set_type(Float(32)); - std::vector temp(tensor->shape().numel()); - // LOG(INFO) <<"[CUDA] The tensor's size is "<< tensor->shape().numel(); - is.read(reinterpret_cast(temp.data()), size); - CUDA_CALL(cudaMemcpy(reinterpret_cast(data), - temp.data(), - tensor->shape().numel() * sizeof(float), - cudaMemcpyHostToDevice)); + if (desc.data_type() != Type::VarType_Type_FP32) + PADDLE_THROW( + phi::errors::InvalidArgument("[CUDA] The type is not fp32!!")); + auto *data = tensor->mutable_data(target); + tensor->set_type(Float(32)); + std::vector temp(tensor->shape().numel()); + // LOG(INFO) <<"[CUDA] The tensor's size is "<< tensor->shape().numel(); + is.read(reinterpret_cast(temp.data()), size); + CUDA_CALL(cudaMemcpy(reinterpret_cast(data), + temp.data(), + tensor->shape().numel() * sizeof(float), + cudaMemcpyHostToDevice)); #else - PADDLE_THROW(phi::errors::Fatal( - "To use CUDA backends, you need to set WITH_CUDA ON!")); + PADDLE_THROW(phi::errors::Fatal( + "To use CUDA backends, you need to set WITH_CUDA ON!")); #endif - } else { 
- CINN_NOT_IMPLEMENTED - } + }, + }); } void LoadLoDTensor(std::istream &is, diff --git a/paddle/cinn/frontend/paddle_model_to_program.cc b/paddle/cinn/frontend/paddle_model_to_program.cc index 7249c35f19d26..b7e512fe18260 100644 --- a/paddle/cinn/frontend/paddle_model_to_program.cc +++ b/paddle/cinn/frontend/paddle_model_to_program.cc @@ -410,39 +410,98 @@ void PaddleModelToProgram::AddOpMapper_relu6() { var_model_to_program_map_[out_name] = out->id; }; } + +template +Variable AddOpMapperDepthwiseConv2dImpl(common::UnknownArch, + T* net_builder, + const paddle::cpp::OpDesc& op_desc, + const Variable& x, + const Variable& y) { + LOG(FATAL) << "NotImplemented."; +} + +template +Variable AddOpMapperDepthwiseConv2dImpl(common::X86Arch, + T* net_builder, + const paddle::cpp::OpDesc& op_desc, + const Variable& x, + const Variable& y) { + CHECK(op_desc.HasAttr("paddings")); + auto paddings = op_desc.GetAttr>("paddings"); + CHECK(op_desc.HasAttr("strides")); + auto strides = op_desc.GetAttr>("strides"); + CHECK(op_desc.HasAttr("dilations")); + auto dilations = op_desc.GetAttr>("dilations"); + CHECK(op_desc.HasAttr("groups")); + auto groups = op_desc.GetAttr("groups"); + CHECK(op_desc.HasAttr("data_format")); + std::string data_format = op_desc.GetAttr("data_format"); + if (data_format == "AnyLayout") { + data_format = "NCHW"; + } + return net_builder->Conv2d( + x, y, strides, paddings, dilations, groups, data_format); +} + +template +Variable AddOpMapperDepthwiseConv2dImpl(common::ARMArch, + T* net_builder, + const paddle::cpp::OpDesc& op_desc, + const Variable& x, + const Variable& y) { + LOG(FATAL) << "NotImplemented."; +} + +template +Variable AddOpMapperDepthwiseConv2dImpl(common::NVGPUArch, + T* net_builder, + const paddle::cpp::OpDesc& op_desc, + const Variable& x, + const Variable& y) { + CHECK(op_desc.HasAttr("paddings")); + auto paddings = op_desc.GetAttr>("paddings"); + CHECK(op_desc.HasAttr("strides")); + auto strides = op_desc.GetAttr>("strides"); + CHECK(op_desc.HasAttr("dilations")); + auto dilations = op_desc.GetAttr>("dilations"); + CHECK(op_desc.HasAttr("groups")); + auto groups = op_desc.GetAttr("groups"); + CHECK(op_desc.HasAttr("data_format")); + std::string data_format = op_desc.GetAttr("data_format"); + if (data_format == "AnyLayout") { + data_format = "NCHW"; + } + Variable out; + return net_builder->DepthwiseConv2d( + x, y, strides, paddings, dilations, groups, data_format); +} + +template +Variable AddOpMapperDepthwiseConv2d(common::Arch arch, + T* net_builder, + const paddle::cpp::OpDesc& op_desc, + const Variable& x, + const Variable& y) { + return std::visit( + [&](const auto& impl) { + return AddOpMapperDepthwiseConv2dImpl(impl, net_builder, op_desc, x, y); + }, + arch.variant()); +} + void PaddleModelToProgram::AddOpMapper_depthwise_conv2d() { op_mappers_["depthwise_conv2d"] = [&](const paddle::cpp::OpDesc& op_desc) { CHECK_EQ(op_desc.Input("Input").size(), 1UL); auto x_name = op_desc.Input("Input").front(); CHECK_EQ(op_desc.Input("Filter").size(), 1UL); auto y_name = op_desc.Input("Filter").front(); - CHECK_EQ(op_desc.Output("Output").size(), 1UL); - auto out_name = op_desc.Output("Output").front(); - - CHECK(op_desc.HasAttr("paddings")); - auto paddings = op_desc.GetAttr>("paddings"); - CHECK(op_desc.HasAttr("strides")); - auto strides = op_desc.GetAttr>("strides"); - CHECK(op_desc.HasAttr("dilations")); - auto dilations = op_desc.GetAttr>("dilations"); - CHECK(op_desc.HasAttr("groups")); - auto groups = op_desc.GetAttr("groups"); - 
CHECK(op_desc.HasAttr("data_format"));
-    std::string data_format = op_desc.GetAttr<std::string>("data_format");
-    if (data_format == "AnyLayout") {
-      data_format = "NCHW";
-    }
     auto x = GetVar(TransValidVarName(x_name));
     auto y = GetVar(TransValidVarName(y_name));
-    Variable out;
-    if (target_.arch == Target::Arch::X86) {
-      out = net_builder_->Conv2d(
-          x, y, strides, paddings, dilations, groups, data_format);
-    } else {
-      out = net_builder_->DepthwiseConv2d(
-          x, y, strides, paddings, dilations, groups, data_format);
-    }
-
+    auto* net_builder = net_builder_.get();
+    Variable out =
+        AddOpMapperDepthwiseConv2d(target_.arch, net_builder, op_desc, x, y);
+    CHECK_EQ(op_desc.Output("Output").size(), 1UL);
+    auto out_name = op_desc.Output("Output").front();
     AddVar(TransValidVarName(out_name), out);
     var_model_to_program_map_[out_name] = out->id;
   };
@@ -635,13 +694,13 @@ void PaddleModelToProgram::TransposeVar(const std::string& name) {
   auto* var = scope_->FindVar(name);
   if (var) {
     auto& tensor = absl::get<hlir::framework::Tensor>(*var);
-    if (target_.arch == Target::Arch::X86) {
+    if (std::holds_alternative<common::X86Arch>(target_.arch)) {
       float* data = tensor->mutable_data<float>(target_);
       CHECK(tensor->shape().size() == 2)
           << "The y data's shape size of op [mul] is not equal to 2! Please "
             "check.";
       TransposeData(data, tensor->shape().data()[0], tensor->shape().data()[1]);
-    } else if (target_.arch == Target::Arch::NVGPU) {
+    } else if (std::holds_alternative<common::NVGPUArch>(target_.arch)) {
 #ifdef CINN_WITH_CUDA
       // To use cublas mul api, there is no need to transpose data.
 #ifndef CINN_WITH_CUDNN
@@ -691,13 +750,13 @@ void PaddleModelToProgram::ReverseHWVar(const std::string& name) {
   auto* var = scope_->FindVar(name);
   if (var) {
     auto& tensor = absl::get<hlir::framework::Tensor>(*var);
-    if (target_.arch == Target::Arch::X86) {
+    if (std::holds_alternative<common::X86Arch>(target_.arch)) {
       float* data = tensor->mutable_data<float>(target_);
       CHECK(tensor->shape().size() == 4)
           << "The y data's shape size of op [conv2d] is not equal to 4! Please "
             "check.";
       ReverseHWData(data, tensor->shape().data());
-    } else if (target_.arch == Target::Arch::NVGPU) {
+    } else if (std::holds_alternative<common::NVGPUArch>(target_.arch)) {
 #ifdef CINN_WITH_CUDA
       std::vector<float> data(tensor->shape().numel());
       CUDA_CALL(cudaMemcpy(
diff --git a/paddle/cinn/frontend/pass/gemm_rewriter.cc b/paddle/cinn/frontend/pass/gemm_rewriter.cc
index fe178c0b88137..fae47d5e2a9c5 100644
--- a/paddle/cinn/frontend/pass/gemm_rewriter.cc
+++ b/paddle/cinn/frontend/pass/gemm_rewriter.cc
@@ -40,7 +40,8 @@ class GemmRewriterPass : public ProgramPass {
   void ApplyImpl(Program* prog,
                  const std::unordered_set<std::string>& fetch_ids,
                  const cinn::common::Target& target) override {
-    if (target.arch != Target::Arch::NVGPU || !prog->size()) {
+    if (!std::holds_alternative<common::NVGPUArch>(target.arch) ||
+        !prog->size()) {
       return;
     }
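The adt::match visitation used in computation.cc and model_parser.cc above is, presumably, the classic C++17 "overloaded lambdas" helper; a self-contained sketch of that idiom (the match name and its behavior here are assumptions about CINN's header, not copied from it):

#include <iostream>
#include <variant>

struct X86Arch {};
struct NVGPUArch {};
using Arch = std::variant<X86Arch, NVGPUArch>;

// Inherit operator() from every lambda so std::visit can pick the overload
// matching the variant's active alternative.
template <class... Ts>
struct match : Ts... {
  using Ts::operator()...;
};
template <class... Ts>
match(Ts...) -> match<Ts...>;  // deduction guide (C++17)

int main() {
  Arch arch = NVGPUArch{};
  std::visit(match{
                 [](X86Arch) { std::cout << "host memcpy path\n"; },
                 [](NVGPUArch) { std::cout << "cudaMemcpy path\n"; },
             },
             arch);
}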
+ if (minimal_inputs->empty()) { + for (const auto& dim_expr : out_dim_exprs) { + if (!dim_expr.isa<std::int64_t>()) return false; + } + } // generate output_dim_expr_attrs ConvertDimExprToAttributes( ir_context, out_dim_exprs, /*out*/ output_dim_expr_attrs); @@ -627,6 +634,26 @@ bool MakeGenerateShapeOpAttribute( *minimal_inputs, symbol_names_in_out_dim_exprs, /*out*/ symbol_bindings); + + // check that every symbol used by the output dim exprs has a binding + for (const auto& symbol_name : symbol_names_in_out_dim_exprs) { + bool has_symbol_binding = false; + for (const auto& symbol_binding : *symbol_bindings) { + const std::string& symbol_binding_name = std::visit( + [&](const auto& symbol_binding) -> const std::string& { + return symbol_binding.symbol_name; + }, + symbol_binding); + if (symbol_name == symbol_binding_name) { + has_symbol_binding = true; + break; + } + } + if (!has_symbol_binding) { + LOG(WARNING) << "no symbol binding found for dim expr: " << symbol_name; + return false; + } + } return true; } diff --git a/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc b/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc index 71f0b9f33f4ec..40008b51a54f2 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc +++ b/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc @@ -65,7 +65,6 @@ void GroupOp::Build(pir::Builder& builder, // NOLINT std::unique_ptr<pir::Block>&& block) { VLOG(4) << "Start build GroupOp"; if (block && !block->empty()) { - // IR_ENFORCE(block->back().isa<::pir::YieldOp>()); PADDLE_ENFORCE_EQ(block->back().isa<::pir::YieldOp>(), true); auto& op = block->back(); for (size_t i = 0; i < op.num_operands(); ++i) { @@ -83,7 +82,10 @@ pir::Block* GroupOp::block() { pir::Block* GroupOp::block() const { pir::Region& region = (*this)->region(0); - CHECK(!region.empty()); + PADDLE_ENFORCE_EQ(region.empty(), + false, + ::common::errors::Unavailable( + "Required GroupOp's region must not be empty.")); return &region.front(); } @@ -156,7 +158,16 @@ pir::Block* FusionOp::block() { return &region.front(); } -std::vector<pir::Operation*> FusionOp::GetOperators() { +pir::Block* FusionOp::block() const { + pir::Region& region = (*this)->region(0); + PADDLE_ENFORCE_EQ(region.empty(), + false, + ::common::errors::Unavailable( + "Required FusionOp's region must not be empty.")); + return &region.front(); +} + +std::vector<pir::Operation*> FusionOp::GetOperators() const { std::vector<pir::Operation*> rt_ops; for (auto& op : *block()) { rt_ops.push_back(&op); } @@ -192,6 +203,13 @@ void YieldStoreOp::Build(pir::Builder& builder, void YieldStoreOp::VerifySig() {} +bool YieldStoreOp::InferSymbolicShape( + pir::ShapeConstraintIRAnalysis* shape_analysis) { + shape_analysis->SetShapeOrDataForValue( + result(0), shape_analysis->GetShapeOrDataForValue(operand_source(0))); + return true; +} + bool ConcatOp::InferSymbolicShape( pir::ShapeConstraintIRAnalysis* shape_analysis) { VLOG(4) << "Infer symbolic shape for cinn_op.concat"; @@ -298,7 +316,9 @@ void GenerateShapeOp::Build( if (inputs.empty()) { VLOG(3) << "GenerateShapeOp inputs is empty"; for (const auto& attr : output_dim_exprs) { - CHECK(attr.isa<pir::Int64Attribute>()); + PADDLE_ENFORCE(attr.isa<pir::Int64Attribute>(), + ::common::errors::PreconditionNotMet( + "Required attr must be Int64Attribute.")); } } argument.AddInputs(inputs); @@ -460,11 +480,15 @@ bool GenerateShapeOp::InferSymbolicShape( const auto attr_dim_exprs = [&] { std::vector<symbol::DimExpr> dim_exprs{}; pir::Attribute dim_expr_attr = this->attributes().at("output_dim_exprs"); - CHECK(dim_expr_attr.isa<pir::ArrayAttribute>()); + PADDLE_ENFORCE(dim_expr_attr.isa<pir::ArrayAttribute>(), + ::common::errors::PreconditionNotMet( + "Required dim_expr_attr must be an ArrayAttribute.")); auto array = dim_expr_attr.dyn_cast<pir::ArrayAttribute>();
for (int i = 0; i < array.size(); ++i) { const auto& dim_expr = ConvertAttributeToDimExpr(array.at(i)); - CHECK(dim_expr.has_value()); + PADDLE_ENFORCE(dim_expr.has_value(), + ::common::errors::PreconditionNotMet( + "Required dim_expr.has_value()==true.")); dim_exprs.push_back(dim_expr.value()); } return dim_exprs; }(); @@ -474,7 +498,9 @@ bool GenerateShapeOp::InferSymbolicShape( this->attributes().at("symbol_bindings"); auto symbol_bindings = ConvertAttributeToSymbolBindings(symbol_bindings_attr); - CHECK(symbol_bindings.has_value()); + PADDLE_ENFORCE(symbol_bindings.has_value(), + ::common::errors::PreconditionNotMet( + "Required symbol_bindings.has_value()==true.")); return symbol_bindings.value(); }(); auto DimExprs4InputDim = diff --git a/paddle/cinn/hlir/dialect/operator/ir/manual_op.h b/paddle/cinn/hlir/dialect/operator/ir/manual_op.h index d350cbb3d5208..34c53ed2ebe6b 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/manual_op.h +++ b/paddle/cinn/hlir/dialect/operator/ir/manual_op.h @@ -77,7 +77,8 @@ class IR_API FusionOp : public pir::Op { const cinn::dialect::GroupInfo &group_info); pir::Block *block(); - std::vector<pir::Operation *> GetOperators(); + pir::Block *block() const; + std::vector<pir::Operation *> GetOperators() const; void VerifySig(); @@ -86,7 +87,9 @@ // YieldStoreOp represents a store operation for // separate local variable and output -class IR_API YieldStoreOp : public pir::Op<YieldStoreOp> { +class IR_API YieldStoreOp + : public pir::Op<YieldStoreOp, + paddle::dialect::InferSymbolicShapeInterface> { public: using Op::Op; static const char *name() { return "cinn_op.yield_store"; } @@ -98,6 +101,8 @@ pir::Type output_type); void VerifySig(); + + bool InferSymbolicShape(pir::ShapeConstraintIRAnalysis *shape_analysis); }; class IR_API ConcatOp diff --git a/paddle/cinn/hlir/dialect/operator/transforms/CMakeLists.txt b/paddle/cinn/hlir/dialect/operator/transforms/CMakeLists.txt index e329b8886f18b..de3f79a112b6f 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/CMakeLists.txt +++ b/paddle/cinn/hlir/dialect/operator/transforms/CMakeLists.txt @@ -7,7 +7,7 @@ set(cinn_transforms_deps cinn_op_dialect op_dialect_vjp cinn_runtime_dialect - # group_cluster + op_fusion pir_compiler) cinn_cc_library(cinn_transforms SRCS ${cinn_transforms_srcs} DEPS @@ -16,4 +16,4 @@ cinn_cc_library(cinn_transforms SRCS ${cinn_transforms_srcs} DEPS cc_library( add_cinn_pass SRCS add_cinn_pass.cc - DEPS op_dialect pir cinn_op_dialect cinnapi pir_transforms cinn_transforms) + DEPS pir_transforms cinn_transforms) diff --git a/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc index 3b6b1adcdbda1..e69b0e7d96bd1 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc @@ -29,8 +29,8 @@ #include "paddle/cinn/hlir/dialect/operator/transforms/add_store_in_fusion_op_pass.h" #include "paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.h" #include "paddle/cinn/hlir/dialect/operator/transforms/dynamic_reshape_pass.h" +#include "paddle/cinn/hlir/dialect/operator/transforms/fold_manipulation_ops_pass.h" #include "paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.h" -#include "paddle/cinn/hlir/dialect/operator/transforms/group_merge/check_infer_symbolic_pass.h" #include "paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_0d_to_1d_pass.h" #include
"paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_dynamic_to_static_dim_pass.h" #include "paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_static_dim_to_dynamic_pass.h" @@ -38,11 +38,9 @@ #include "paddle/cinn/hlir/dialect/operator/transforms/group_merge/move_generate_shape_ops_to_prologue_pass.h" #include "paddle/cinn/hlir/dialect/operator/transforms/group_merge/simplify_dim_expr_pass.h" #include "paddle/cinn/hlir/dialect/operator/transforms/group_merge/single_op_fallback_to_phi.h" -#include "paddle/cinn/hlir/dialect/operator/transforms/group_merge/substitute_dim_expr_based_on_constraints_pass.h" #include "paddle/cinn/hlir/dialect/operator/transforms/insert_broadcast_pass.h" #include "paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/lower_cinn_fusion_op_pass.h" #include "paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.h" -#include "paddle/cinn/hlir/dialect/operator/transforms/remove_unchanged_reshape_pass.h" #include "paddle/cinn/hlir/dialect/operator/transforms/replace_dynamic_expand_pass.h" #include "paddle/cinn/hlir/dialect/operator/transforms/split_generate_shape_into_shape_ops_pass.h" #include "paddle/fluid/pir/transforms/build_cinn_pass.h" @@ -50,13 +48,16 @@ #include "paddle/fluid/pir/transforms/shape_optimization_pass.h" COMMON_DECLARE_bool(print_ir); -COMMON_DECLARE_bool(check_infer_symbolic); +COMMON_DECLARE_bool(disable_dyshape_in_train); PD_DECLARE_bool(group_schedule_tiling_first); namespace cinn::dialect::ir { namespace { bool HasDynamicShape(const pir::Program& program) { + if (FLAGS_disable_dyshape_in_train) { + return false; + } for (const auto& op : *program.block()) { if (op.isa()) { continue; @@ -92,11 +93,6 @@ void ApplyCinnPreprocessPass( std::shared_ptr pass_manager = CreatePassManager(); bool has_dynamic_shape = HasDynamicShape(*program); - if (!has_dynamic_shape && FLAGS_check_infer_symbolic) { - pass_manager->AddPass(pir::CreateShapeOptimizationPass()); - pass_manager->AddPass(cinn::dialect::ir::CreateCheckInferSymbolicPass()); - } - if (has_dynamic_shape) { pass_manager->AddPass(cinn::dialect::ir::CreateConvert0DTo1DPass()); pass_manager->AddPass(pir::CreateShapeOptimizationPass()); @@ -118,7 +114,7 @@ void ApplyBuildGroupOpPass( if (has_dynamic_shape) { pass_manager->AddPass(pir::CreateShapeOptimizationPass()); } - pass_manager->AddPass(cinn::dialect::ir::CreateRemoveUnchangedReshapePass()); + pass_manager->AddPass(cinn::dialect::ir::CreateFoldManipulationOpsPass()); pass_manager->AddPass(pir::CreateBuildCinnPass()); @@ -134,8 +130,6 @@ void ApplyGroupOpPass(::pir::Program* program, if (HasDynamicShape(*program)) { pass_manager->AddPass(::pir::CreateShapeOptimizationPass()); pass_manager->AddPass(cinn::dialect::ir::CreateInsertBroadcastPass()); - pass_manager->AddPass( - cinn::dialect::ir::CreateSubstituteDimExprBasedOnConstraintsPass()); pass_manager->AddPass(cinn::dialect::ir::CreateSimplifyDimExprPass()); pass_manager->AddPass( cinn::dialect::ir::CreateFuseShapeOpsIntoGenerateShapeOpPass()); @@ -145,7 +139,7 @@ void ApplyGroupOpPass(::pir::Program* program, pass_manager->AddPass(cinn::dialect::ir::CreateDynamicReshapeOpPass()); pass_manager->AddPass(pir::CreateDeadCodeEliminationPass()); - pass_manager->AddPass(cinn::dialect::ir::CreateRemoveUnchangedReshapePass()); + pass_manager->AddPass(cinn::dialect::ir::CreateFoldManipulationOpsPass()); pass_manager->Run(program); } diff --git a/paddle/cinn/hlir/dialect/operator/transforms/add_store_in_fusion_op_pass.cc 
b/paddle/cinn/hlir/dialect/operator/transforms/add_store_in_fusion_op_pass.cc index 143f72985a3bf..d66943dfc8bf9 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/add_store_in_fusion_op_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/add_store_in_fusion_op_pass.cc @@ -35,42 +35,11 @@ class AddYieldStoreInFusionOpPattern auto& shape_analysis = pir::ShapeAnalysisManager::Instance().Get(op->GetParentProgram()); for (auto i = 0; i < op->num_operands(); ++i) { - if (auto reshape_op = op->operand_source(i) - .defining_op() - ->dyn_cast<cinn::dialect::ReshapeOp>()) { - if (reshape_op.operand_source(0).defining_op() == nullptr) { - continue; - } - auto pre_name = reshape_op.operand_source(0).defining_op()->name(); - - if (op->operand_source(i).use_count() > 1) { - continue; - } - - if ((pre_name != "cinn_op.reduce_sum") && - (pre_name != "cinn_op.reduce_max")) { - auto store_op = rewriter.Build<cinn::dialect::YieldStoreOp>( - op->operand_source(i).defining_op()->operand_source(0), - op->operand_source(i).type()); - - if (shape_analysis.HasShapeOrDataForValue(reshape_op->result(0))) { - shape_analysis.SetShapeOrDataForValue( - store_op.result(0), - shape_analysis.GetShapeOrDataForValue(reshape_op->result(0))); - } - - op->operand(i).set_source(store_op.result(0)); - if (reshape_op->result(0).use_count() == 0) { - rewriter.EraseOp(reshape_op); - } - continue; - } - } - if (op->operand_source(i).use_count() == 1) { continue; } + rewriter.SetInsertionPointAfter(op->operand_source(i).defining_op()); auto store_op = rewriter.Build<cinn::dialect::YieldStoreOp>( op->operand_source(i), op->operand_source(i).type()); auto orignal_base = op->operand_source(i); diff --git a/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_pass.cc new file mode 100644 index 0000000000000..d5ec3042186e3 --- /dev/null +++ b/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_pass.cc @@ -0,0 +1,339 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include "paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_pass.h" + +#include "paddle/cinn/hlir/dialect/operator/ir/generate_shape_util.h" +#include "paddle/cinn/hlir/dialect/operator/ir/manual_op.h" +#include "paddle/cinn/hlir/dialect/operator/ir/op_dialect.h" +#include "paddle/cinn/hlir/dialect/runtime/ir/runtime_dialect.h" +#include "paddle/common/ddim.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/pir/include/core/builtin_type.h" +#include "paddle/pir/include/pattern_rewrite/pattern_rewrite_driver.h" + +namespace cinn { +namespace dialect { +namespace ir { + +namespace { + +class BlockDimExprsAsserter { + public: + BlockDimExprsAsserter(const DimExprs4ValueT& func, + pir::IrContext* ir_ctx, + pir::Block* block) + : GraphDimExprs4Value(func), + ir_ctx_(ir_ctx), + block_(block), + builder_(ir_ctx, block) {} + + void AssertDimExprs() { + const auto ops = [&] { + std::vector<pir::Operation*> ops; + ops.reserve(block_->size()); + for (auto& op : *block_) { + ops.push_back(&op); + } + return ops; + }(); + for (auto* op : ops) { + if (op->num_regions() == 0) { + AssertDimExprForOutput(op); + } else { + AssertOpRegions(op); + } + } + } + + private: + void AssertOpRegions(const pir::Operation* op) { + for (std::size_t i = 0; i < op->num_regions(); ++i) { + for (auto& block : op->region(i)) { + BlockDimExprsAsserter asserter(GraphDimExprs4Value, ir_ctx_, &block); + asserter.AssertDimExprs(); + } + } + } + + void InitLocalShapeAnalysis(const pir::Operation& op, + pir::ShapeConstraintIRAnalysis* shape_analysis) { + auto VisitEachInputAndDimExprs = [&](const auto& Visit) { + for (int i = 0; i < op.num_operands(); ++i) { + pir::Value input = op.operand_source(i); + const auto& value_dim_exprs = GraphDimExprs4Value(input); + Visit(input, value_dim_exprs); + } + }; + auto NewSymbolReplacedDimExprs = [&](const auto& dim_exprs) { + auto NewSymbolReplaced = [shape_analysis](const auto& dim_expr) { + if (dim_expr.template isa<std::int64_t>()) return dim_expr; + return symbol::DimExpr(shape_analysis->GetNextSymName()); + }; + std::vector<symbol::DimExpr> ret; + ret.reserve(dim_exprs.size()); + for (const auto& dim_expr : dim_exprs) { + ret.push_back(NewSymbolReplaced(dim_expr)); + } + return ret; + }; + auto NewSymbolReplacedTensor = + [&](const symbol::TensorShapeOrDataDimExprs& tensor_shape_or_data) { + auto shape = NewSymbolReplacedDimExprs(tensor_shape_or_data.shape()); + const auto& data = tensor_shape_or_data.data(); + if (!data.has_value()) { + return symbol::ShapeOrDataDimExprs( + symbol::TensorShapeOrDataDimExprs(shape)); + } else { + auto replaced_data = NewSymbolReplacedDimExprs(data.value()); + return symbol::ShapeOrDataDimExprs( + symbol::TensorShapeOrDataDimExprs(shape, replaced_data)); + } + }; + auto NewSymbolReplacedTensorList = + [&](const symbol::TensorListShapeOrDataDimExprs& shape_or_data_list) { + symbol::TensorListShapeOrDataDimExprs ret; + ret.reserve(shape_or_data_list.size()); + for (auto& shape_or_data : shape_or_data_list) { + const auto& replaced_shape_or_data = + NewSymbolReplacedTensor(shape_or_data); + ret.push_back(replaced_shape_or_data + .dyn_cast<symbol::TensorShapeOrDataDimExprs>()); + } + return symbol::ShapeOrDataDimExprs(ret); + }; + auto GetNewSymbolReplaced = [&](const auto& value_dim_exprs) { + auto patterns =
symbol::Overloaded{NewSymbolReplacedTensor, + NewSymbolReplacedTensorList}; + return std::visit(patterns, value_dim_exprs.variant()); + }; + VisitEachInputAndDimExprs([&](auto value, const auto& value_dim_exprs) { + const auto& new_symbol_replaced = GetNewSymbolReplaced(value_dim_exprs); + shape_analysis->SetShapeOrDataForValue(value, new_symbol_replaced); + }); + } + + DimExprs4ValueT MakeOpDimExprs4Value(const pir::Operation* op) { + auto shape_analysis = std::make_shared<pir::ShapeConstraintIRAnalysis>(); + InitLocalShapeAnalysis(*op, shape_analysis.get()); + + pir::Operation* mut_op = const_cast<pir::Operation*>(op); + auto interface = + mut_op->dyn_cast<paddle::dialect::InferSymbolicShapeInterface>(); + if (!interface) { + PADDLE_THROW(phi::errors::Unimplemented( + op->name() + " DOES NOT have InferSymbolicShapeInterface!")); + } else { + bool infer_result = interface.InferSymbolicShape(shape_analysis.get()); + PADDLE_ENFORCE_EQ(infer_result, + true, + ::common::errors::PreconditionNotMet( + "InferSymbolicShape for %s failed.", op->name())); + } + return [shape_analysis]( + pir::Value value) -> const symbol::ShapeOrDataDimExprs& { + return shape_analysis->GetShapeOrDataForValue(value); + }; + } + + void AssertDimExprForOutput(pir::Operation* op) { // NOLINT + VLOG(5) << "Add assert for result of [ " << op->name() << " ]"; + if (!op->HasInterface<paddle::dialect::InferSymbolicShapeInterface>()) { + LOG(INFO) << "skip the checking for [ " << op->name() << " ]"; + return; + } + auto OpDimExprs4Value = MakeOpDimExprs4Value(op); + const auto& inputs = [&] { + std::vector<pir::Value> inputs; + inputs.reserve(op->num_operands()); + for (int i = 0; i < op->num_operands(); ++i) { + const auto& input = op->operand_source(i); + if (input.type().isa()) { + return std::vector<pir::Value>{}; + } + inputs.push_back(input); + } + return inputs; + }(); + if (inputs.empty()) return; + builder_.SetInsertionPointAfter(op); + for (std::size_t i = 0; i < op->num_results(); ++i) { + pir::Value output = op->result(i); + const auto& shape_or_data_dim_expr = GraphDimExprs4Value(output); + if (!shape_or_data_dim_expr.isa<symbol::TensorShapeOrDataDimExprs>()) + continue; + if (shape_or_data_dim_expr.data().has_value()) { + TryAssertDimExprsForOutputData(inputs, output, OpDimExprs4Value); + } else { + TryAssertDimExprsForOutputShape(inputs, output, OpDimExprs4Value); + } + } + } + + void TryAssertDimExprsForOutputShape( + const std::vector<pir::Value>& inputs, + pir::Value output, + const DimExprs4ValueT& OpDimExprs4Value) { + if (!::common::contain_unknown_dim( + output.type() + .dyn_cast<pir::DenseTensorType>() + .dims())) { + return; + } + auto opt_shape_tensor_from_dim_exprs = + BuildShapeTensorFromShapeDimExprs(inputs, output, OpDimExprs4Value); + if (!opt_shape_tensor_from_dim_exprs.has_value()) return; + const auto& shape_tensor_from_dim_exprs = + opt_shape_tensor_from_dim_exprs.value(); + auto shape_tensor_from_infer_meta = BuildShapeTensorFromInferMeta(output); + AddAssertEqual(shape_tensor_from_dim_exprs, shape_tensor_from_infer_meta); + } + + std::optional<pir::Value> BuildShapeTensorFromShapeDimExprs( + const std::vector<pir::Value>& inputs, + pir::Value output, + const DimExprs4ValueT& OpDimExprs4Value) { + const auto& shape_or_data = GraphDimExprs4Value(output); + const auto& dim_exprs = shape_or_data.shape(); + return BuildShapeTensorFromDimExprs(inputs, dim_exprs, OpDimExprs4Value); + } + + std::optional<pir::Value> BuildShapeTensorFromDataDimExprs( + const std::vector<pir::Value>& inputs, + pir::Value output, + const DimExprs4ValueT& OpDimExprs4Value) { + const auto& shape_or_data = GraphDimExprs4Value(output); + const auto& dim_exprs = shape_or_data.data(); + if (!dim_exprs.has_value()) return std::nullopt; + return BuildShapeTensorFromDimExprs( + inputs,
dim_exprs.value(), OpDimExprs4Value); + } + + std::optional<pir::Value> BuildShapeTensorFromDimExprs( + const std::vector<pir::Value>& inputs, + const std::vector<symbol::DimExpr>& dim_exprs, + const DimExprs4ValueT& OpDimExprs4Value) { + const auto& LocalDimExprs4Value = + [&](pir::Value value) -> const symbol::ShapeOrDataDimExprs& { + return OpDimExprs4Value(value); + }; + std::vector<pir::Value> input_tensors{}; + std::vector<pir::Attribute> output_dim_expr_attrs{}; + GenerateShapeOp::SymbolBindings symbol_bindings{}; + bool success = + MakeGenerateShapeOpAttribute(ir_ctx_, + LocalDimExprs4Value, + dim_exprs, + /*origin inputs*/ inputs, + /*minimal inputs*/ &input_tensors, + &output_dim_expr_attrs, + &symbol_bindings); + if (!success) return std::nullopt; + auto out_shape_value = + builder_ + .Build<cinn::dialect::GenerateShapeOp>( + input_tensors, output_dim_expr_attrs, symbol_bindings) + .out(); + return builder_ + .Build<paddle::dialect::CastOp>(out_shape_value, phi::DataType::INT32) + .out(); + } + + pir::Value BuildShapeTensorFromInferMeta(pir::Value output) { + return builder_.Build<paddle::dialect::ShapeOp>(output).out(); + } + + void TryAssertDimExprsForOutputData(const std::vector<pir::Value>& inputs, + pir::Value output, + const DimExprs4ValueT& OpDimExprs4Value) { + auto opt_shape_tensor_from_dim_exprs = + BuildShapeTensorFromDataDimExprs(inputs, output, OpDimExprs4Value); + if (!opt_shape_tensor_from_dim_exprs.has_value()) return; + AddAssertEqual(opt_shape_tensor_from_dim_exprs.value(), output); + } + + size_t GetNumel(pir::Value value) { + const auto& dims = value.type().dyn_cast<pir::DenseTensorType>().dims(); + int64_t numel = ::common::product(dims); + PADDLE_ENFORCE_GE( + numel, + 0, + ::common::errors::InvalidArgument( + "The numel of value must be >= 0, but received numel is %d.", + numel)); + return numel; + } + + void AddAssertEqual(pir::Value lhs, pir::Value rhs) { + size_t lhs_numel = GetNumel(lhs); + size_t rhs_numel = GetNumel(rhs); + PADDLE_ENFORCE_EQ(lhs_numel, + rhs_numel, + ::common::errors::InvalidArgument( + "The numel of lhs and rhs must be equal, but " + "received lhs's numel is [%d], rhs's numel is [%d]", + lhs_numel, + rhs_numel)); + pir::Value lhs_eq_rhs = + builder_.Build<paddle::dialect::EqualOp>(lhs, rhs).out(); + pir::Value all_eq = + builder_.Build<paddle::dialect::AllOp>(lhs_eq_rhs).out(); + builder_.Build<paddle::dialect::AssertOp>(all_eq, lhs_eq_rhs, lhs_numel); + } + + DimExprs4ValueT GraphDimExprs4Value; + pir::IrContext* ir_ctx_; + pir::Block* block_; + pir::Builder builder_; +}; + +class CheckInferSymbolicPass : public pir::Pass { + public: + explicit CheckInferSymbolicPass(const DimExprs4ValueT& func) + : pir::Pass("check_infer_symbolic", 1), GraphDimExprs4Value(func) {} + + void Run(pir::Operation* op) override { + for (uint32_t i = 0; i < op->num_regions(); ++i) { + for (auto& block : op->region(i)) { + auto* ir_ctx = pir::IrContext::Instance(); + BlockDimExprsAsserter asserter(GraphDimExprs4Value, ir_ctx, &block); + asserter.AssertDimExprs(); + } + } + } + + bool CanApplyOn(pir::Operation* op) const override { + return op->isa<pir::ModuleOp>() && op->num_regions() > 0; + } + + private: + DimExprs4ValueT GraphDimExprs4Value; +}; + +} // namespace + +std::unique_ptr<::pir::Pass> CreateCheckInferSymbolicPass( + const DimExprs4ValueT& GraphDimExprs4Value) { + return std::make_unique<CheckInferSymbolicPass>(GraphDimExprs4Value); +} + +} // namespace ir +} // namespace dialect +} // namespace cinn diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/substitute_dim_expr_based_on_constraints_pass.h b/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_pass.h similarity index 70% rename from paddle/cinn/hlir/dialect/operator/transforms/group_merge/substitute_dim_expr_based_on_constraints_pass.h rename to
paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_pass.h index 30c0dd7b6a7b6..527632d1c1008 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/substitute_dim_expr_based_on_constraints_pass.h +++ b/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_pass.h @@ -14,15 +14,21 @@ #pragma once +#include <functional> +#include <memory> +#include "paddle/pir/include/dialect/shape/utils/shape_or_data_expr.h" #include "paddle/pir/include/pass/pass.h" namespace cinn { namespace dialect { namespace ir { -// This is a helper pass for substituting DimExpr based on the -// constraints symbol::Equal. -std::unique_ptr<::pir::Pass> CreateSubstituteDimExprBasedOnConstraintsPass(); +using DimExprs4ValueT = + std::function<const symbol::ShapeOrDataDimExprs&(pir::Value)>; +std::unique_ptr<::pir::Pass> CreateCheckInferSymbolicPass( + const DimExprs4ValueT& OptDimExprs4Value); + } // namespace ir } // namespace dialect } // namespace cinn diff --git a/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.cc b/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.cc new file mode 100644 index 0000000000000..ff9b9dcd07d9c --- /dev/null +++ b/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include "paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.h" +#include <functional> +#include <memory> +#include "paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_pass.h" +#include "paddle/cinn/hlir/dialect/operator/transforms/split_generate_shape_into_shape_ops_pass.h" +#include "paddle/common/flags.h" +#include "paddle/fluid/pir/transforms/shape_optimization_pass.h" + +COMMON_DECLARE_bool(check_infer_symbolic); +PD_DECLARE_bool(prim_all); + +namespace cinn { +namespace dialect { +namespace ir { + +namespace { + +DimExprs4ValueT MakeDimExprs4Value( + pir::Program* program, const PassManagerCreater& CreatePassManager) { + std::shared_ptr<pir::PassManager> pass_manager = CreatePassManager(); + pass_manager->AddPass(pir::CreateShapeOptimizationPass()); + pass_manager->Run(program); + const auto* shape_analysis = + &pir::ShapeAnalysisManager::Instance().Get(program); + return + [shape_analysis](pir::Value value) -> const symbol::ShapeOrDataDimExprs& { + return shape_analysis->GetShapeOrDataForValue(value); + }; +} + +} // namespace + +void CheckInferSymbolicIfNeed(pir::Program* program, + const PassManagerCreater& CreatePassManager) { + if (!FLAGS_prim_all || !FLAGS_check_infer_symbolic) return; + const auto& GraphDimExprs4Value = + MakeDimExprs4Value(program, CreatePassManager); + std::shared_ptr<pir::PassManager> pass_manager = CreatePassManager(); + pass_manager->AddPass(CreateCheckInferSymbolicPass(GraphDimExprs4Value)); + pass_manager->AddPass(CreateSplitGenerateShapeIntoShapeOpsPass()); + pass_manager->Run(program); +} + +} // namespace ir +} // namespace dialect +} // namespace cinn diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/check_infer_symbolic_pass.h b/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.h similarity index 74% rename from paddle/cinn/hlir/dialect/operator/transforms/group_merge/check_infer_symbolic_pass.h rename to paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.h index 825efa23eedf6..d61dd2c6d27f3 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/check_infer_symbolic_pass.h +++ b/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.h @@ -14,16 +14,21 @@ #pragma once +#include <functional> #include <memory> #include <string> #include "paddle/pir/include/pass/pass.h" +#include "paddle/pir/include/pass/pass_manager.h" namespace cinn { namespace dialect { namespace ir { -// This is a helper pass for checking the symbolic inference accuracy.
-std::unique_ptr<::pir::Pass> CreateCheckInferSymbolicPass(); +using PassManagerCreater = std::function<std::shared_ptr<pir::PassManager>()>; + +void CheckInferSymbolicIfNeed(pir::Program* program, + const PassManagerCreater& CreatePassManager); + } // namespace ir } // namespace dialect } // namespace cinn diff --git a/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc index 2b8926bca6e60..606d07fd59826 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc @@ -33,6 +33,7 @@ #include "paddle/cinn/hlir/dialect/operator/ir/manual_op.h" #include "paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_util.h" #include "paddle/cinn/hlir/framework/pir/utils.h" +#include "paddle/cinn/operator_fusion/group_cluster.h" #include "paddle/common/ddim.h" #include "paddle/common/flags.h" #include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" @@ -48,8 +49,7 @@ #include "paddle/pir/include/pattern_rewrite/pattern_match.h" #include "paddle/pir/include/pattern_rewrite/pattern_rewrite_driver.h" -// #include "paddle/cinn/frontend/group_cluster/group_cluster.h" -// PD_DECLARE_bool(cinn_new_cluster_op_method); +PD_DECLARE_bool(cinn_new_cluster_op_method); namespace cinn { namespace dialect { @@ -249,7 +249,6 @@ std::vector<::pir::Value> GenerateOutputValue( if (outside_need_value.count(op->result(i))) { if (!inserted_val.count(op->result(i))) { temp_out.push_back(op->result(i)); - inserted_val.insert(op->result(i)); } } @@ -835,30 +834,39 @@ std::vector<GroupClusterNode> NodeMergeWithNode( return second_stage_output; } -// std::vector<GroupClusterNode> NewOpMergeWithOp( -// cinn::dialect::GroupOp group_op) { -// const auto cluster_result = frontend::ClusterOps(group_op); - -// // Each stmts corresponds to each fusion op(cluster node). -// // Concat all the ops of patterns in the stmts, and make them the op list -// of -// // cluster node. -// VLOG(4) << "Start Creating Cluster Nodes!"; -// std::vector<GroupClusterNode> output_cluster_nodes; -// for (const auto& op_set : cluster_result) { -// GroupClusterNode cluster_node; -// for (const auto* op : op_set) { -// cluster_node.ops.push_back(const_cast<pir::Operation*>(op)); -// auto op_kind = cinn::hlir::framework::pir::CompatibleInfo::OpKind(*op); -// cluster_node.group_kind = -// cluster_node.group_kind > op_kind ? cluster_node.group_kind : -// op_kind; -// } -// output_cluster_nodes.push_back(cluster_node); -// } -// VLOG(4) << "Finished Creating Cluster Nodes!"; -// return output_cluster_nodes; -// } +std::vector<GroupClusterNode> NewOpMergeWithOp( + cinn::dialect::GroupOp group_op) { + std::function<cinn::fusion::FrontendContent(pir::Operation*)> func = + [](pir::Operation* op) { return cinn::fusion::FrontendContent(op); }; + const auto& contents = cinn::fusion::MapVector(group_op.GetOperators(), func); + auto cluster_result = cinn::fusion::ClusterOps(contents); + std::vector<std::vector<pir::Operation*>> result; + std::transform( + cluster_result.begin(), + cluster_result.end(), + std::back_inserter(result), + [](const cinn::fusion::PatternNodePtr node) { + return cinn::fusion::GetOpsInPattern(node->stmt_pattern_); + }); + + // Each stmt corresponds to one fusion op (cluster node). + // Concat all the ops of the patterns in the stmts and make them the op list + // of the cluster node.
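
NewOpMergeWithOp above adapts the new cinn::fusion clustering API to this pass's existing GroupClusterNode representation: MapVector wraps each operation into a FrontendContent, ClusterOps groups them into pattern nodes, and a std::transform with std::back_inserter flattens each node back into a plain op list. A toy, self-contained version of that flattening idiom (PatternNode below is a stand-in, not the real cinn::fusion type):

#include <algorithm>
#include <iostream>
#include <iterator>
#include <memory>
#include <vector>

// Stand-in for a fusion pattern node: it owns a list of "ops" (ints here,
// pir::Operation* in the real pass).
struct PatternNode {
  std::vector<int> ops;
};
using PatternNodePtr = std::shared_ptr<PatternNode>;

int main() {
  const std::vector<PatternNodePtr> cluster_result = {
      std::make_shared<PatternNode>(PatternNode{{1, 2, 3}}),
      std::make_shared<PatternNode>(PatternNode{{4}})};

  // std::transform + std::back_inserter maps each cluster node to its op
  // list, yielding one vector of ops per prospective fusion group.
  std::vector<std::vector<int>> result;
  std::transform(cluster_result.begin(),
                 cluster_result.end(),
                 std::back_inserter(result),
                 [](const PatternNodePtr& node) { return node->ops; });

  for (const auto& ops : result) std::cout << ops.size() << " ops\n";
}

The loop that follows performs the second half of the conversion, folding each op list into a GroupClusterNode while keeping the strongest op kind seen.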
+ VLOG(4) << "Start Creating Cluster Nodes!"; + std::vector output_cluster_nodes; + for (const auto& op_set : result) { + GroupClusterNode cluster_node; + for (const auto* op : op_set) { + cluster_node.ops.push_back(const_cast(op)); + auto op_kind = cinn::hlir::framework::pir::CompatibleInfo::OpKind(*op); + cluster_node.group_kind = + cluster_node.group_kind > op_kind ? cluster_node.group_kind : op_kind; + } + output_cluster_nodes.push_back(cluster_node); + } + VLOG(4) << "Finished Creating Cluster Nodes!"; + return output_cluster_nodes; +} std::vector OpMergeWithOp(cinn::dialect::GroupOp group_op) { // op merge with op @@ -926,9 +934,9 @@ std::vector OpMergeWithOp(cinn::dialect::GroupOp group_op) { std::vector GroupSplit(cinn::dialect::GroupOp group_op) { // stage 1 - // if (FLAGS_cinn_new_cluster_op_method) { - // return NewOpMergeWithOp(group_op); - // } + if (FLAGS_cinn_new_cluster_op_method) { + return NewOpMergeWithOp(group_op); + } auto first_stage_output = OpMergeWithOp(group_op); @@ -1044,14 +1052,12 @@ class CinnGroupClusterPattern // update ir mapping for (size_t i = 0; i < output_values.size(); ++i) { ir_mapping.Add(output_values[i], new_group_op->result(i)); - if (shape_analysis.HasShapeOrDataForValue(output_values[i])) { shape_analysis.SetShapeOrDataForValue( new_group_op->result(i), shape_analysis.GetShapeOrDataForValue(output_values[i])); } } - for (size_t i = 0; i < output_values.size(); ++i) { auto find_it = all_output_values.find(output_values[i]); if ((find_it != all_output_values.end()) && @@ -1062,6 +1068,7 @@ class CinnGroupClusterPattern } } } + rewriter.EraseOp(group_op); return true; diff --git a/paddle/cinn/hlir/dialect/operator/transforms/remove_unchanged_reshape_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/fold_manipulation_ops_pass.cc similarity index 69% rename from paddle/cinn/hlir/dialect/operator/transforms/remove_unchanged_reshape_pass.cc rename to paddle/cinn/hlir/dialect/operator/transforms/fold_manipulation_ops_pass.cc index a2c09cc14a8dc..bbd79947314d2 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/remove_unchanged_reshape_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/fold_manipulation_ops_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/cinn/hlir/dialect/operator/transforms/remove_unchanged_reshape_pass.h" +#include "paddle/cinn/hlir/dialect/operator/transforms/fold_manipulation_ops_pass.h" #include "paddle/cinn/hlir/dialect/operator/ir/cinn_op.h" #include "paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_util.h" @@ -75,7 +75,7 @@ bool RemoveOp(pir::Operation* op, pir::PatternRewriter* rewriter) { } template -class RemoveUnchangedReshapePattern : public pir::OpRewritePattern { +class RemoveUnchangedOpPattern : public pir::OpRewritePattern { public: using pir::OpRewritePattern::OpRewritePattern; @@ -85,18 +85,19 @@ class RemoveUnchangedReshapePattern : public pir::OpRewritePattern { } }; -class MergeReshapePattern - : public pir::OpRewritePattern { +template +class MergeRedundantOpPattern : public pir::OpRewritePattern { public: - using pir::OpRewritePattern::OpRewritePattern; + using pir::OpRewritePattern::OpRewritePattern; - bool MatchAndRewrite(cinn::dialect::ReshapeOp op, + bool MatchAndRewrite(OPTYPE op, pir::PatternRewriter& rewriter) const override { - if (auto pre_shape = op->operand_source(0) - .defining_op() - ->dyn_cast()) { - op->operand(0).set_source(pre_shape->operand_source(0)); - + if (auto pre_op = (op->operand_source(0).defining_op()) + ->template dyn_cast()) { + op->operand(0).set_source(pre_op->operand_source(0)); + if (pre_op->use_empty()) { + rewriter.EraseOp(pre_op); + } return true; } @@ -104,18 +105,24 @@ class MergeReshapePattern } }; -class RemoveUnchangedReshapePass : public pir::PatternRewritePass { +class FoldManipulationOpsPass : public pir::PatternRewritePass { public: - RemoveUnchangedReshapePass() - : pir::PatternRewritePass("remove_unchanged_reshape_pass", 1) {} + FoldManipulationOpsPass() + : pir::PatternRewritePass("fold_manipulation_ops_pass", 1) {} pir::RewritePatternSet InitializePatterns(pir::IrContext* context) override { pir::RewritePatternSet ps(context); - // remove out_shape equal in_shape reshape op - ps.Add>(context); - ps.Add>(context); - ps.Add(context); + // remove out_shape equal in_shape ops + ps.Add>(context); + ps.Add>(context); + ps.Add>(context); + ps.Add>(context); + // merge redundant ops + ps.Add>(context); + ps.Add>(context); + ps.Add>(context); + ps.Add>(context); ps.Add(context); return ps; @@ -126,13 +133,12 @@ class RemoveUnchangedReshapePass : public pir::PatternRewritePass { } }; -std::unique_ptr CreateRemoveUnchangedReshapePass() { - return std::make_unique(); +std::unique_ptr CreateFoldManipulationOpsPass() { + return std::make_unique(); } - } // namespace ir } // namespace dialect } // namespace cinn -REGISTER_IR_PASS(remove_unchanged_reshape_pass, - ::cinn::dialect::ir::RemoveUnchangedReshapePass); +REGISTER_IR_PASS(fold_manipulation_ops_pass, + ::cinn::dialect::ir::FoldManipulationOpsPass); diff --git a/paddle/cinn/hlir/dialect/operator/transforms/remove_unchanged_reshape_pass.h b/paddle/cinn/hlir/dialect/operator/transforms/fold_manipulation_ops_pass.h similarity index 93% rename from paddle/cinn/hlir/dialect/operator/transforms/remove_unchanged_reshape_pass.h rename to paddle/cinn/hlir/dialect/operator/transforms/fold_manipulation_ops_pass.h index ef75306748af2..239ba863389f7 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/remove_unchanged_reshape_pass.h +++ b/paddle/cinn/hlir/dialect/operator/transforms/fold_manipulation_ops_pass.h @@ -21,7 +21,7 @@ namespace cinn { namespace dialect { namespace ir { -std::unique_ptr CreateRemoveUnchangedReshapePass(); +std::unique_ptr 
CreateFoldManipulationOpsPass(); } // namespace ir } // namespace dialect } // namespace cinn diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/check_infer_symbolic_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/check_infer_symbolic_pass.cc deleted file mode 100644 index 953e268b27a80..0000000000000 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/check_infer_symbolic_pass.cc +++ /dev/null @@ -1,175 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/cinn/hlir/dialect/operator/transforms/group_merge/check_infer_symbolic_pass.h" - -#include "paddle/cinn/hlir/dialect/operator/ir/generate_shape_util.h" -#include "paddle/cinn/hlir/dialect/operator/ir/op_dialect.h" -#include "paddle/cinn/hlir/dialect/runtime/ir/runtime_dialect.h" -#include "paddle/cinn/runtime/flags.h" -#include "paddle/common/flags.h" -#include "paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h" -#include "paddle/pir/include/core/builtin_type.h" - -namespace cinn { -namespace dialect { -namespace ir { - -namespace { - -std::string SprintShape(const std::vector<std::vector<int64_t>>& shapes) { - std::string str; - for (int i = 0; i < shapes.size(); i++) { - str += "["; - for (int j = 0; j < shapes[i].size(); j++) { - str += std::to_string(shapes[i][j]); - if (j != shapes[i].size() - 1) { - str += ", "; - } - } - str += "]"; - if (i != shapes.size() - 1) { - str += ", "; - } - } - return str; -} - -void PrintProgram(pir::ModuleOp m, const std::string& mgs) { - std::ostringstream print_stream; - print_stream << "\n\n"; - m.program()->Print(print_stream); - print_stream << "\n\n"; - VLOG(4) << "===================== " << mgs << " =====================\n" << print_stream.str(); -} - -std::vector<std::vector<int64_t>> GetStaticValueShape(pir::Value value) { - std::vector<std::vector<int64_t>> static_shape; - if (const pir::DenseTensorType& dense_tensor = value.type().dyn_cast<::pir::DenseTensorType>()) { - static_shape.push_back(::common::vectorize(dense_tensor.dims())); - } else if (const pir::VectorType vector_tensor = value.type().dyn_cast<::pir::VectorType>()) { - for (size_t i = 0; i < vector_tensor.size(); i++) { - if (vector_tensor[i].isa<pir::DenseTensorType>()) { - const pir::DenseTensorType& dense_tensor = vector_tensor[i].dyn_cast<::pir::DenseTensorType>(); - static_shape.push_back(::common::vectorize(dense_tensor.dims())); - } - } - } else { - IR_THROW("error:the value doesn't have DenseTensorType"); - } - return static_shape; -} - -std::vector<int64_t> GetShapeFromTensor( const symbol::TensorShapeOrDataDimExprs& tensor_shape_or_data) { - std::vector<int64_t> dynamic_shape; - for (const auto& dim_expr_shape : tensor_shape_or_data.shape()) { - CHECK(dim_expr_shape.Has<std::int64_t>()); - dynamic_shape.push_back(dim_expr_shape.Get<std::int64_t>()); - } - return dynamic_shape; -} - -std::vector<std::vector<int64_t>> GetDynamicValueShape( pir::Value value, const pir::ShapeConstraintIRAnalysis& shape_analysis) { - std::vector<std::vector<int64_t>> dynamic_shapes; - if (!shape_analysis.HasShapeOrDataForValue(value)) {
- return dynamic_shapes; - } - symbol::ShapeOrDataDimExprs shape_or_data = shape_analysis.GetShapeOrDataForValue(value); - auto lambdas = symbol::Overloaded{ - [&](const symbol::TensorShapeOrDataDimExprs& tensor_shape_or_data) { - dynamic_shapes.push_back(GetShapeFromTensor(tensor_shape_or_data)); - }, - [&](const symbol::TensorListShapeOrDataDimExprs& tensor_list) { - for (const auto& tensor_shape_or_data : tensor_list) { - dynamic_shapes.push_back(GetShapeFromTensor(tensor_shape_or_data)); - } - }}; - std::visit(lambdas, shape_or_data.variant()); - return dynamic_shapes; -} - -void CompareStaticAndDynamicValueShape( - pir::Value value, - const pir::ShapeConstraintIRAnalysis& shape_analysis, - int op_index, - pir::ModuleOp module_op) { - std::vector<std::vector<int64_t>> static_value_shape = GetStaticValueShape(value); - std::vector<std::vector<int64_t>> dynamic_value_shape = GetDynamicValueShape(value, shape_analysis); - if (static_value_shape != dynamic_value_shape) { - VLOG(4) << "CheckInferSymbolic failed, in the following program, the " << op_index << "th op : the shape is not equal\nthe static shape is: " << SprintShape(static_value_shape) << ", and the dynamic shape is: " << SprintShape(dynamic_value_shape); - PrintProgram(module_op, "CheckInferSymbolic"); - } -} - -void CheckInferSymbolic(pir::ModuleOp module_op) { - VLOG(4) << "CheckInferSymbolic start"; - int op_index = 0; - const auto& shape_analysis = pir::ShapeAnalysisManager::Instance().Get(module_op.program()); - for (uint32_t i = 0; i < module_op->num_regions(); i++) { - for (const auto& block : module_op->region(i)) { - for (const auto& op : block) { - for (std::size_t j = 0; j < op.num_operands(); ++j) { - CompareStaticAndDynamicValueShape( - op.operand_source(j), shape_analysis, op_index, module_op); - } - for (std::size_t j = 0; j < op.num_results(); ++j) { - CompareStaticAndDynamicValueShape( - op.result(j), shape_analysis, op_index, module_op); - } - op_index++; - } - } - } - VLOG(4) << "CheckInferSymbolic end"; -} - -class CheckInferSymbolicPass : public pir::Pass { - public: - CheckInferSymbolicPass() : pir::Pass("check_infer_symbolic_pass", 1) {} - - void Run(pir::Operation* op) override { - pir::ModuleOp module_op = op->dyn_cast<pir::ModuleOp>(); - CheckInferSymbolic(module_op); - } - - bool CanApplyOn(pir::Operation* op) const override { - return op->isa<pir::ModuleOp>() && op->num_regions() > 0; - } -}; - -} // namespace - -std::unique_ptr<::pir::Pass> CreateCheckInferSymbolicPass() { - return std::make_unique<CheckInferSymbolicPass>(); -} - -} // namespace ir -} // namespace dialect -} // namespace cinn diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_dynamic_to_static_dim_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_dynamic_to_static_dim_pass.cc index d1550a2bdf257..72219287fe3e3 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_dynamic_to_static_dim_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_dynamic_to_static_dim_pass.cc @@ -181,10 +181,23 @@ class DynamicToStaticConverter { CHECK(shape_analysis_->HasShapeOrDataForValue(value)); const auto& origin_shape = GetOriginValueShape(value); const auto& target_shape = GetTargetValueShape(value); - CHECK_EQ(origin_shape.size(), target_shape.size()); + PADDLE_ENFORCE_EQ( + origin_shape.size(), + target_shape.size(), + phi::errors::InvalidArgument( + "The size of origin shape and target shape is not equal, " + "where the size of origin shape is %d but the size of target " + "shape is %d.", + origin_shape.size(),
target_shape.size())); for (std::size_t i = 0; i < origin_shape.size(); ++i) { if (origin_shape.at(i) == -1) { - CHECK_GT(target_shape.at(i), 0); + PADDLE_ENFORCE_GT(target_shape.at(i), + 0, + phi::errors::InvalidArgument( + "The size of target shape is incorrect. " + "Expected size is larger than 0, but received %d.", + target_shape.at(i))); update = true; } else { CHECK(origin_shape.at(i) == target_shape.at(i)); diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_static_dim_to_dynamic_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_static_dim_to_dynamic_pass.cc index e67cb5aacabfa..e20cab270cdd3 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_static_dim_to_dynamic_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_static_dim_to_dynamic_pass.cc @@ -154,7 +154,15 @@ struct StaticDimToDynamicConverter { const auto& origin_shape = GetOriginValueShape(value); const auto& target_shape = GetTargetValueShape( shape_analysis->GetShapeOrDataForValue(value).shape()); - CHECK_EQ(origin_shape.size(), target_shape.size()); + PADDLE_ENFORCE_EQ( + origin_shape.size(), + target_shape.size(), + phi::errors::InvalidArgument( + "The size of origin shape and target shape is not equal, " + "where the size of origin shape is %d but the size of target " + "shape is %d.", + origin_shape.size(), + target_shape.size())); const auto& origin_type = value.type().dyn_cast<::pir::DenseTensorType>(); pir::DenseTensorType target_type = pir::DenseTensorType::get(pir::IrContext::Instance(), diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass.cc index 79b8a70d28acc..1b0519938c933 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass.cc @@ -1941,7 +1941,14 @@ class GeneralFusionMergePassHelper { } } - CHECK_GE(producer->consumer_groups().size(), candidates.size()); + PADDLE_ENFORCE_GE( + producer->consumer_groups().size(), + candidates.size(), + phi::errors::InvalidArgument( + "The size of producer consumer groups is incorrect. "
+ "Expected size is greater than or equal to %d, but receive %d.", + candidates.size(), + producer->consumer_groups().size())); if (producer->consumer_groups().size() == 0 && candidates.size() == 0 && output_ops_set_.count(producer->CollectOps()[0]) == 0) { producer->belong_groups.insert(*fusionable_consumers->begin()); @@ -2204,8 +2211,24 @@ class GeneralFusionMergePassHelper { CHECK(consumer->belong_groups.size()); consumers.insert(*consumer->belong_groups.begin()); } - CHECK_EQ(group->producer_groups().size(), producers.size()); - CHECK_EQ(group->consumer_groups().size(), consumers.size()); + PADDLE_ENFORCE_EQ( + group->producer_groups().size(), + producers.size(), + phi::errors::InvalidArgument( + "The size of group's producer groups and producers is not equal," + "where the size of group's producer groups:%d but the size of " + "producers:%d.", + group->producer_groups().size(), + producers.size())); + PADDLE_ENFORCE_EQ( + group->consumer_groups().size(), + consumers.size(), + phi::errors::InvalidArgument( + "The size of group's consumer groups and consumers is not equal," + "where the size of group's consumer groups:%d but the size of " + "consumers:%d.", + group->consumer_groups().size(), + consumers.size())); (*group->mut_producer_groups()) = producers; (*group->mut_consumer_groups()) = consumers; } diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/substitute_dim_expr_based_on_constraints_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/substitute_dim_expr_based_on_constraints_pass.cc deleted file mode 100644 index 97570459eebc1..0000000000000 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/substitute_dim_expr_based_on_constraints_pass.cc +++ /dev/null @@ -1,216 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "paddle/cinn/hlir/dialect/operator/transforms/group_merge/substitute_dim_expr_based_on_constraints_pass.h" - -#include "paddle/cinn/common/union_find.h" -#include "paddle/cinn/hlir/dialect/operator/ir/manual_op.h" -#include "paddle/pir/include/dialect/shape/ir/shape_attribute.h" -#include "paddle/pir/include/dialect/shape/utils/dim_expr_util.h" - -namespace cinn { -namespace dialect { -namespace ir { - -namespace { - -template <typename DoEachT> -void VisitEachOp(pir::Operation* op, const DoEachT& DoEach) { - DoEach(op); - for (auto& region : *op) { - for (auto& block : region) { - for (auto& op_in_block : block) { - DoEach(&op_in_block); - } - } - } -} - -template <typename DoEachT> -void VisitEachValue(const pir::Operation* op, const DoEachT& DoEach) { - for (std::size_t i = 0; i < op->num_operands(); ++i) { - DoEach(op->operand_source(i)); - } - for (std::size_t i = 0; i < op->num_results(); ++i) { - DoEach(op->result(i)); - } -} - -symbol::TensorShapeOrDataDimExprs SubstituteTensorShapeOrData( - const symbol::TensorShapeOrDataDimExprs& shape_or_data, - const std::unordered_map<symbol::DimExpr, symbol::DimExpr>& - substitution_pattern) { - auto SubstituteOneDimExpr = - [](const std::vector<symbol::DimExpr>& original_dim_expr, - const std::unordered_map<symbol::DimExpr, symbol::DimExpr>& - substitution_pattern) -> std::vector<symbol::DimExpr> { - std::vector<symbol::DimExpr> substituted_dim_expr{}; - for (const symbol::DimExpr& dim_expr : original_dim_expr) { - const auto& tmp_dim_expr = - symbol::SubstituteDimExpr(dim_expr, substitution_pattern); - substituted_dim_expr.push_back(symbol::SimplifyDimExpr(tmp_dim_expr)); - } - return substituted_dim_expr; - }; - - std::vector<symbol::DimExpr> substituted_shape = - SubstituteOneDimExpr(shape_or_data.shape(), substitution_pattern); - if (!shape_or_data.data().has_value()) { - return symbol::ShapeOrData<symbol::DimExpr>(substituted_shape); - } else { - std::vector<symbol::DimExpr> substituted_data = SubstituteOneDimExpr( - shape_or_data.data().value(), substitution_pattern); - return symbol::ShapeOrData<symbol::DimExpr>(substituted_shape, - substituted_data); - } -} - -symbol::ShapeOrDataDimExprs SubstituteShapeOrData( - const symbol::ShapeOrDataDimExprs& shape_or_data, - const std::unordered_map<symbol::DimExpr, symbol::DimExpr>& - substitution_pattern) { - auto lambdas = symbol::Overloaded{ - [&](const symbol::TensorShapeOrDataDimExprs& tensor_shape_or_data) { - return symbol::ShapeOrDataDimExprs(SubstituteTensorShapeOrData( - tensor_shape_or_data, substitution_pattern)); - }, - [&](const symbol::TensorListShapeOrDataDimExprs& tensor_list) { - symbol::TensorListShapeOrDataDimExprs substituted_tensor_list; - for (symbol::TensorShapeOrDataDimExprs tensor_shape_or_data : - tensor_list) { - substituted_tensor_list.push_back(SubstituteTensorShapeOrData( - tensor_shape_or_data, substitution_pattern)); - } - return symbol::ShapeOrDataDimExprs(substituted_tensor_list); - }}; - return std::visit(lambdas, shape_or_data.variant()); -} - -int GetDimExprPriority(const symbol::DimExpr& dim_expr) { - return std::visit( - symbol::Overloaded{ - [&](std::int64_t) { return 0; }, - [&](const std::string&) { return 1; }, - [&](const symbol::Negative<symbol::DimExpr>&) { return 2; }, - [&](const symbol::Reciprocal<symbol::DimExpr>&) { return 2; }, - [&](const symbol::Add<symbol::DimExpr>&) { return 2; }, - [&](const symbol::Mul<symbol::DimExpr>&) { return 2; }, - [&](const symbol::Max<symbol::DimExpr>&) { return 2; }, - [&](const symbol::Min<symbol::DimExpr>&) { return 2; }, - [&](const symbol::Broadcast<symbol::DimExpr>&) { return 2; }, - }, - dim_expr.variant()); -} - -std::unordered_map<symbol::DimExpr, symbol::DimExpr> GetDimExprSubstitution( - pir::ShapeConstraintIRAnalysis* shape_analysis) { - const std::vector<symbol::DimExprConstraint>& dim_expr_constraints = - shape_analysis->DimExprBuilder().constraints(); - const cinn::common::UnionFindSet<symbol::DimExpr>&
union_find_set = [&]() { - cinn::common::UnionFindSet<symbol::DimExpr> union_find_set; - for (const auto& constraint : dim_expr_constraints) { - CHECK(std::holds_alternative<symbol::Equal<symbol::DimExpr>>(constraint)) - << "The DimExprConstraint type is no Equal, this part is to " - "be completed."; - const auto& data = - std::get<symbol::Equal<symbol::DimExpr>>(constraint).data; - union_find_set.Union(data->lhs, data->rhs); - } - return union_find_set; - }(); - - const std::vector<std::vector<symbol::DimExpr>>& dim_expr_clusters = - union_find_set.Clusters(); - std::unordered_map<symbol::DimExpr, symbol::DimExpr> substitution_pattern; - for (const auto& dim_expr_cluster : dim_expr_clusters) { - CHECK(!dim_expr_cluster.empty()); - auto dim_expr_root = dim_expr_cluster[0]; - for (const auto& dim_expr : dim_expr_cluster) { - if (GetDimExprPriority(dim_expr) < GetDimExprPriority(dim_expr_root)) { - dim_expr_root = dim_expr; - } - } - for (const auto& dim_expr : dim_expr_cluster) { - if (dim_expr != dim_expr_root) { - substitution_pattern[dim_expr] = dim_expr_root; - } - } - } - return substitution_pattern; -} - -void SubstituteDimExprBasedOnConstraints(pir::Operation* region_op) { - VLOG(4) << "SubstituteDimExprBasedOnConstraints start"; - pir::ShapeConstraintIRAnalysis* shape_analysis = - &pir::ShapeAnalysisManager::Instance().Get(region_op->GetParentProgram()); - const std::unordered_map<symbol::DimExpr, symbol::DimExpr>& - substitution_pattern = GetDimExprSubstitution(shape_analysis); - - VisitEachOp(region_op, [&](pir::Operation* op) { - VisitEachValue(op, [&](pir::Value value) { - if (!shape_analysis->HasShapeOrDataForValue(value)) { - VLOG(4) << "Can not find ShapeOrData for value of op(" << op->name() - << ") in shape_analysis"; - } else { - const symbol::ShapeOrDataDimExprs& origin_shape_or_data = - shape_analysis->GetShapeOrDataForValue(value); - VLOG(8) << op->name() - << " origin_shape_or_data: " << origin_shape_or_data; - const symbol::ShapeOrDataDimExprs& substituted_shape_or_data = - SubstituteShapeOrData(origin_shape_or_data, substitution_pattern); - VLOG(8) << op->name() - << " substituted_shape_or_data: " << substituted_shape_or_data; - shape_analysis->SetShapeOrDataForValue(value, - substituted_shape_or_data); - } - }); - if (op->num_regions() > 0) { - return; - } - if (op->num_results() > 0) { - pir::shape::SetShapeAttrForOp( - op, shape_analysis->GetShapeOrDataForValue(op->result(0))); - } else { - pir::shape::SetShapeAttrForOp( - op, shape_analysis->GetShapeOrDataForValue(op->operand_source(0))); - } - }); - VLOG(4) << "SubstituteDimExprBasedOnConstraints end"; -} - -class SubstituteDimExprBasedOnConstraintsPass : public pir::Pass { - public: - SubstituteDimExprBasedOnConstraintsPass() - : pir::Pass("substitute_dim_expr_based_on_constraints_pass", 1) {} - - void Run(pir::Operation* op) override { - SubstituteDimExprBasedOnConstraints(op); - } - - bool CanApplyOn(pir::Operation* op) const override { - return op->num_regions() > 0; - } -}; - -} // namespace - -std::unique_ptr<::pir::Pass> CreateSubstituteDimExprBasedOnConstraintsPass() { - return std::make_unique<SubstituteDimExprBasedOnConstraintsPass>(); -} - -} // namespace ir -} // namespace dialect -} // namespace cinn diff --git a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/broadcast_with_cf.cc b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/broadcast_with_cf.cc index 7a8615ad2ef97..22917b41d5b1c 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/broadcast_with_cf.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/broadcast_with_cf.cc @@ -53,7 +53,6 @@ static bool SameInputOutputShape( } void CompileGroupToJitKernelOp( - const std::vector<pir::Value>& group_inputs,
pir::PatternRewriter& rewriter, // NOLINT std::unordered_map<pir::Block*, OpLoweringGroupPtr>* group_map) { // prepare attribute for jit_kernel_op @@ -73,6 +72,7 @@ void CompileGroupToJitKernelOp( auto& yield_op = block->back(); CHECK(yield_op.isa<::pir::YieldOp>()) << "Last op of block should be yield"; rewriter.set_insertion_point(&yield_op); + const auto& group_inputs = GetBlockOutsideInput(group->ops()); auto jit_kernel_op = rewriter.Build<cinn::dialect::JitKernelOp>( group_inputs, op_attr_map.at(group), output_types); CHECK(jit_kernel_op.num_results() == group_output_values.size()); @@ -108,11 +108,12 @@ void UpdateGroupShapeExprs( const auto& origin_shape_or_data = origin_group->GetShapeOrDataExprs(origin_val); if (origin_shape_or_data.data()) { + std::vector<symbol::DimExpr> shape_dim_expr_shape = { + symbol::DimExpr(static_cast<int64_t>(shape_dim_expr.size()))}; new_group->SetShapeOrDataExprs( new_val, symbol::ShapeOrDataDimExprs{symbol::TensorShapeOrDataDimExprs( - std::vector<symbol::DimExpr>{shape_dim_expr.size()}, - shape_dim_expr)}); + shape_dim_expr_shape, shape_dim_expr)}); } else { new_group->SetShapeOrDataExprs( new_val, @@ -134,7 +135,9 @@ bool EraseOneExpand( if (!SameInputOutputShape(expand, ShapeOrDataDimExprs4Value)) continue; auto generate_shape_op = expand.shape().defining_op(); - CHECK_NOTNULL(generate_shape_op); + PADDLE_ENFORCE_NOT_NULL(generate_shape_op, + phi::errors::PreconditionNotMet( + "The generate shape op must not be null.")); rewriter.ReplaceAllUsesWith(expand.out(), expand.x()); rewriter.EraseOp(expand); if (generate_shape_op->use_empty()) { @@ -280,7 +283,15 @@ void SetLeafBlockByGroupView( } auto new_group = CloneGroup(origin_group, block, &ir_mapping); + PADDLE_ENFORCE_EQ( + origin_group->ops().size(), + new_group->ops().size(), + phi::errors::InvalidArgument( + "The size of origin group ops and new group ops is not equal, " + "where the size of origin group ops is %d but the size of new group " + "ops is %d.", + origin_group->ops().size(), + new_group->ops().size())); UpdateGroupShapeExprs(new_group, origin_group, ir_mapping, @@ -500,7 +511,7 @@ pir::Operation* CompileBroadcastTreeToConditionBlock( VLOG(6) << "After simply condition block: " << *program; // 3.
compile condition block to jit_kernel_op - CompileGroupToJitKernelOp(group_inputs, rewriter, &group_map); + CompileGroupToJitKernelOp(rewriter, &group_map); VLOG(6) << "compile condition block to jit_kernel_op: " << *program; return cond_op; diff --git a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/collect_sym_expr.cc b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/collect_sym_expr.cc index fd5a71e47c105..7526ad1ab6309 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/collect_sym_expr.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/collect_sym_expr.cc @@ -14,6 +14,7 @@ #include "paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/collect_sym_expr.h" #include "paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.h" +#include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_symbolic_shape.h" #include "paddle/pir/include/dialect/shape/utils/dim_expr_util.h" namespace { @@ -135,81 +136,39 @@ bool IsShapeOrDataNeedSubstitute( return ret; } -symbol::TensorShapeOrDataDimExprs SubstituteTensorShapeOrData( - const symbol::TensorShapeOrDataDimExprs& shape_or_data, - const std::unordered_map& dim_expr_map) { - const auto& SimplifyDimExpr = - [&](const std::vector& original_dim_expr) - -> std::vector { - std::vector simplified_dim_expr{}; - for (const symbol::DimExpr& dim_expr : original_dim_expr) { - simplified_dim_expr.push_back(symbol::SimplifyDimExpr( - symbol::SubstituteDimExpr(dim_expr, dim_expr_map))); - } - return simplified_dim_expr; - }; - - std::vector simplified_shape = - SimplifyDimExpr(shape_or_data.shape()); - if (!shape_or_data.data().has_value()) { - return symbol::ShapeOrData(simplified_shape); - } - std::vector simplified_data = - SimplifyDimExpr(shape_or_data.data().value()); - return symbol::ShapeOrData(simplified_shape, - simplified_data); -} - -symbol::ShapeOrDataDimExprs SubstituteShapeOrData( - const symbol::ShapeOrDataDimExprs& shape_or_data, - const std::unordered_map& dim_expr_map) { - auto lambdas = symbol::Overloaded{ - [&](const symbol::TensorShapeOrDataDimExprs& tensor_shape_or_data) { - return symbol::ShapeOrDataDimExprs( - SubstituteTensorShapeOrData(tensor_shape_or_data, dim_expr_map)); - }, - [&](const symbol::TensorListShapeOrDataDimExprs& tensor_list) { - symbol::TensorListShapeOrDataDimExprs simplified_tensor_list; - for (symbol::TensorShapeOrDataDimExprs tensor_shape_or_data : - tensor_list) { - simplified_tensor_list.push_back( - SubstituteTensorShapeOrData(tensor_shape_or_data, dim_expr_map)); - } - return symbol::ShapeOrDataDimExprs(simplified_tensor_list); - }}; - return std::visit(lambdas, shape_or_data.variant()); -} - symbol::ShapeOrDataDimExprs TrySubstitute( const symbol::ShapeOrDataDimExprs& shape_or_data, const std::unordered_map& dim_expr_map) { if (!IsShapeOrDataNeedSubstitute(shape_or_data, dim_expr_map)) { return shape_or_data; } - return SubstituteShapeOrData(shape_or_data, dim_expr_map); + return symbol::SubstituteShapeOrData(shape_or_data, dim_expr_map); } -} // namespace - -namespace cinn::dialect::ir::details { +void InferSymbolicShapeForOperation( + pir::Operation* op, pir::ShapeConstraintIRAnalysis* shape_analysis) { + auto infer_symbolic_shape_interface = + op->dyn_cast(); + if (infer_symbolic_shape_interface) { + infer_symbolic_shape_interface.InferSymbolicShape(shape_analysis); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + op->name() + " DOES NOT have InferSymbolicShapeInterface!")); + } +} 
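The InferSymbolicShapeForOperation helper added above is a thin optional-interface dispatch: it asks the op for its InferSymbolicShapeInterface and fails loudly when none is registered. Below is a self-contained sketch of the same dispatch pattern, using illustrative stand-in types rather than the real PIR API:

#include <functional>
#include <iostream>
#include <stdexcept>
#include <string>

// Hypothetical stand-in for pir::Operation: an op either registers an
// "infer symbolic shape" callback (the interface) or it does not.
struct Op {
  std::string name;
  std::function<void()> infer_symbolic_shape;  // empty when unimplemented
};

// Mirrors InferSymbolicShapeForOperation: dispatch through the optional
// interface when present, otherwise fail with an Unimplemented-style error.
void InferSymbolicShapeFor(const Op& op) {
  if (op.infer_symbolic_shape) {
    op.infer_symbolic_shape();
  } else {
    throw std::runtime_error(op.name +
                             " DOES NOT have InferSymbolicShapeInterface!");
  }
}

int main() {
  Op add{"pd_op.add", [] { std::cout << "infer shape of add\n"; }};
  Op mystery{"pd_op.mystery", nullptr};
  InferSymbolicShapeFor(add);  // dispatches through the interface
  try {
    InferSymbolicShapeFor(mystery);  // no interface registered: throws
  } catch (const std::exception& e) {
    std::cout << e.what() << "\n";
  }
  return 0;
}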
std::unordered_map<::pir::Value, symbol::ShapeOrDataDimExprs> -CreateGroupShapeOrDataExprs( - const OpLoweringGroupPtr& group, - pir::ShapeConstraintIRAnalysis& shape_analysis) { // NOLINT - std::unordered_map dim_expr_map = - CollectSubstituteDimExprMap(group, shape_analysis); +GetGroupValue2Shape(const OpLoweringGroupPtr& group, + pir::ShapeConstraintIRAnalysis& shape_analysis) { // NOLINT std::unordered_map<::pir::Value, symbol::ShapeOrDataDimExprs> value2shape; - for (auto* op : group->ops()) { + for (auto op : group->ops()) { for (size_t i = 0; i < op->num_operands(); ++i) { auto operand = op->operand_source(i); if (operand && value2shape.find(operand) == value2shape.end() && shape_analysis.HasShapeOrDataForValue(operand)) { VLOG(6) << "Add value_to_shape_or_data_exprs for " << operand.impl(); value2shape.insert( - {operand, - TrySubstitute(shape_analysis.GetShapeOrDataForValue(operand), - dim_expr_map)}); + {operand, shape_analysis.GetShapeOrDataForValue(operand)}); } } for (size_t i = 0; i < op->num_results(); ++i) { @@ -218,9 +177,49 @@ CreateGroupShapeOrDataExprs( shape_analysis.HasShapeOrDataForValue(result)) { VLOG(6) << "Add value_to_shape_or_data_exprs for " << result.impl(); value2shape.insert( - {result, - TrySubstitute(shape_analysis.GetShapeOrDataForValue(result), - dim_expr_map)}); + {result, shape_analysis.GetShapeOrDataForValue(result)}); + } + } + } + return value2shape; +} + +} // namespace + +namespace cinn::dialect::ir::details { + +std::unordered_map<::pir::Value, symbol::ShapeOrDataDimExprs> +CreateGroupShapeOrDataExprs( + const OpLoweringGroupPtr& group, + pir::ShapeConstraintIRAnalysis& global_shape_analysis) { // NOLINT + std::unordered_map dim_expr_map = + CollectSubstituteDimExprMap(group, global_shape_analysis); + std::unordered_map<::pir::Value, symbol::ShapeOrDataDimExprs> value2shape; + if (dim_expr_map.size() == 0) { + return GetGroupValue2Shape(group, global_shape_analysis); + } + + pir::ShapeConstraintIRAnalysis local_shape_analysis({}); + + // process input values. + VisitEachInputValue(group, [&](::pir::Value value) { + auto new_shape_expr = TrySubstitute( + global_shape_analysis.GetShapeOrDataForValue(value), dim_expr_map); + local_shape_analysis.SetShapeOrDataForValue(value, new_shape_expr); + value2shape.insert({value, new_shape_expr}); + VLOG(6) << "Add value_to_shape_or_data_exprs for " << value.impl(); + }); + + // process the result values of each op. 
+ for (auto* op : group->ops()) { + InferSymbolicShapeForOperation(op, &local_shape_analysis); + for (size_t i = 0; i < op->num_results(); ++i) { + auto result = op->result(i); + if (result && !value2shape.count(result) && + local_shape_analysis.HasShapeOrDataForValue(result)) { + VLOG(6) << "Add value_to_shape_or_data_exprs for " << result.impl(); + value2shape.insert( + {result, local_shape_analysis.GetShapeOrDataForValue(result)}); } } } diff --git a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/lower_cinn_fusion_op_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/lower_cinn_fusion_op_pass.cc index 0e7ebb8e9499d..3fa26f51b5592 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/lower_cinn_fusion_op_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/lower_cinn_fusion_op_pass.cc @@ -34,6 +34,9 @@ pir::Operation* ProcessDyShapeGroup( const OpLoweringGroupPtr& group, pir::ShapeConstraintIRAnalysis& shape_analysis, // NOLINT pir::PatternRewriter& rewriter) { // NOLINT + // NOTE(dev): Need UpdateShapeOrDataExprs first; the logic + // will be migrated into BucketLower later. + UpdateGroupShapeOrDataExprs(const_cast(group)); auto group_inputs = GetBlockOutsideInput(group->ops()); GroupDimExprInfo group_dim_expr_info = GetGroupDimExprInfo(group); const auto& leaves = group_dim_expr_info.all_value_dim_exprs; diff --git a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc index e4724c617dfaf..29c127b42d10d 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc @@ -78,7 +78,8 @@ CompileGroupAsOpAttribute(const std::vector& group_list) { std::unordered_map GetJitKernelAttr( const OpLoweringGroupPtr& group) { - auto kernel_info = CompilationCache::Instance().GetKernelInfo(group); + hlir::framework::pir::FusionInfo fusion_info(*group); + auto kernel_info = CompilationCache::Instance().GetKernelInfo(fusion_info); std::unordered_map attrs{ {cinn::dialect::JitKernelOp::kAttrName, cinn::dialect::CINNKernelInfoAttribute::get(pir::IrContext::Instance(), @@ -88,33 +89,36 @@ std::unordered_map GetJitKernelAttr( OpLoweringGroupPtr BuildOpLoweringGroup(pir::Operation* fusion_op_ptr) { auto fusion_op = fusion_op_ptr->dyn_cast(); - auto group = std::make_shared(); - group->set_op_pattern_kind( - cinn::hlir::framework::OpPatternKind::kElementWise); + std::vector<::pir::Operation*> ops; + auto group_op_kind = cinn::hlir::framework::OpPatternKind::kElementWise; + // Rebuild ops of the group + for (auto op : fusion_op.GetOperators()) { + if (!op->isa<::pir::YieldOp>()) { + ops.push_back(op); + group_op_kind = static_cast(CompatibleInfo::OpKind(*op)) > + static_cast(group_op_kind) + ? CompatibleInfo::OpKind(*op) + : group_op_kind; + } + } + + auto group = std::make_shared(ops); + if (fusion_op.attributes().count("group_info")) { auto attr = fusion_op.attribute("group_info") .dyn_cast() .data(); - group->set_op_pattern_kind(attr.op_pattern_kind); + group_op_kind = + static_cast(attr.op_pattern_kind) > static_cast(group_op_kind) + ?
attr.op_pattern_kind + : group_op_kind; group->set_loop_ranges(attr.loop_ranges); group->set_loop_ranges_expr(attr.loop_ranges_expr); - group->set_reduce_axis(attr.reduce_axis); group->set_alignment_schedule_info(attr.alignment_schedule_info); } - - // Rebuild ops of the group - for (auto op : fusion_op.GetOperators()) { - if (!op->isa<::pir::YieldOp>()) { - group->mut_ops().push_back(op); - auto op_pattern_kind = static_cast(CompatibleInfo::OpKind(*op)) > - static_cast(group->op_pattern_kind()) - ? CompatibleInfo::OpKind(*op) - : group->op_pattern_kind(); - group->set_op_pattern_kind(op_pattern_kind); - } - } + group->set_op_pattern_kind(group_op_kind); // Rebuild output_ops and input_ops of the group auto yield_op = fusion_op.GetOperators().back(); @@ -127,10 +131,7 @@ OpLoweringGroupPtr BuildOpLoweringGroup(pir::Operation* fusion_op_ptr) { // Because the group is rebuilt, the order of group.output_values generated // by BuildCUDAJITInfo may not be same with the order bound in the yield op, // so a mapping is required. - auto& shape_analysis = - pir::ShapeAnalysisManager::Instance().Get(fusion_op->GetParentProgram()); - group->set_value_to_shape_or_data_exprs( - CreateGroupShapeOrDataExprs(group, shape_analysis)); + UpdateGroupShapeOrDataExprs(group); if (FLAGS_cinn_enable_map_expr) { cinn::adt::TryGenerateMapExprFromGroup(group); } @@ -139,4 +140,11 @@ OpLoweringGroupPtr BuildOpLoweringGroup(pir::Operation* fusion_op_ptr) { return group; } +void UpdateGroupShapeOrDataExprs(OpLoweringGroupPtr group) { + auto& shape_analysis = + pir::ShapeAnalysisManager::Instance().Get(group->GetParentProgram()); + group->set_value_to_shape_or_data_exprs( + CreateGroupShapeOrDataExprs(group, shape_analysis)); +} + } // namespace cinn::dialect::ir::details diff --git a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.h b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.h index 3b3ba4379d57c..5c5d0c104390a 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.h +++ b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.h @@ -31,4 +31,6 @@ std::unordered_map GetJitKernelAttr( OpLoweringGroupPtr BuildOpLoweringGroup(pir::Operation* fusion_op_ptr); +void UpdateGroupShapeOrDataExprs(OpLoweringGroupPtr group); + } // namespace cinn::dialect::ir::details diff --git a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc index 3bf32aa91837d..be57629fe8747 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc @@ -761,8 +761,8 @@ class FullWithTensorOpPattern bool MatchAndRewrite(paddle::dialect::FullWithTensorOp op, pir::PatternRewriter &rewriter) const override { - auto shape = op->operand_source(0); - auto value = op->operand_source(1); + auto value = op->operand_source(0); + auto shape = op->operand_source(1); if (paddle::dialect::TransToPhiDataType( value.type() diff --git a/paddle/cinn/hlir/dialect/operator/transforms/split_generate_shape_into_shape_ops_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/split_generate_shape_into_shape_ops_pass.cc index 19e7f5060eb96..0f15edcd0b8d6 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/split_generate_shape_into_shape_ops_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/split_generate_shape_into_shape_ops_pass.cc @@ -116,15 +116,32 @@ struct CachedDimExprToValueConverter { } pir::Value 
ConvertTensorDimToValue(const TensorDimInData& tensor_dim) { - return rewriter - ->Build( - tensor_dim.value, - std::vector{0LL}, - std::vector{tensor_dim.axis}, - std::vector{tensor_dim.axis + 1}, - std::vector{}, - std::vector{}) - .out(); + auto CastToInt64IfNeed = [&](pir::Value value) { + if (value.type() + .dyn_cast() + .dtype() + .isa()) { + return value; + } + return rewriter + ->Build(value, phi::DataType::INT64) + .out(); + }; + if (tensor_dim.value.type() + .dyn_cast() + .dims() + .size() == 0) { + return CastToInt64IfNeed(tensor_dim.value); + } + return CastToInt64IfNeed(rewriter + ->Build( + tensor_dim.value, + std::vector{0LL}, + std::vector{tensor_dim.axis}, + std::vector{tensor_dim.axis + 1}, + std::vector{}, + std::vector{}) + .out()); } pir::Value ConvertToValueImpl( @@ -143,7 +160,12 @@ struct CachedDimExprToValueConverter { pir::Value ConvertToValueImpl(const symbol::Add& dim_expr) { const auto& [operands] = dim_expr; - CHECK_GT(operands->size(), 0); + PADDLE_ENFORCE_GT(operands->size(), + 0, + phi::errors::InvalidArgument( + "The size of operands is incorrect. " + "Expected size is larger than 0, but received %d.", + operands->size())); pir::Value acc = ConvertToValue(operands->at(0)); for (int i = 1; i < operands->size(); ++i) { if (operands->at(i).isa>()) { @@ -162,7 +184,12 @@ struct CachedDimExprToValueConverter { pir::Value ConvertToValueImpl(const symbol::Mul& dim_expr) { const auto& [operands] = dim_expr; - CHECK_GT(operands->size(), 0); + PADDLE_ENFORCE_GT(operands->size(), + 0, + phi::errors::InvalidArgument( + "The size of operands is incorrect. " + "Expected size is larger than 0, but received %d.", + operands->size())); pir::Value prod = ConvertToValue(operands->at(0)); for (int i = 1; i < operands->size(); ++i) { if (operands->at(i).isa>()) { @@ -182,7 +209,12 @@ struct CachedDimExprToValueConverter { pir::Value ConvertToValueImpl(const symbol::Max& dim_expr) { const auto& [operands] = dim_expr; - CHECK_GT(operands->size(), 0); + PADDLE_ENFORCE_GT(operands->size(), + 0, + phi::errors::InvalidArgument( + "The size of operands is incorrect. " + "Expected size is larger than 0, but received %d.", + operands->size())); pir::Value max = ConvertToValue(operands->at(0)); for (int i = 1; i < operands->size(); ++i) { pir::Value operand_value = ConvertToValue(operands->at(i)); @@ -193,7 +225,12 @@ struct CachedDimExprToValueConverter { pir::Value ConvertToValueImpl(const symbol::Min& dim_expr) { const auto& [operands] = dim_expr; - CHECK_GT(operands->size(), 0); + PADDLE_ENFORCE_GT(operands->size(), + 0, + phi::errors::InvalidArgument( + "The size of operands is incorrect. " + "Expected size is larger than 0, but received %d.", + operands->size())); pir::Value min = ConvertToValue(operands->at(0)); for (int i = 1; i < operands->size(); ++i) { pir::Value operand_value = ConvertToValue(operands->at(i)); @@ -205,7 +242,12 @@ struct CachedDimExprToValueConverter { pir::Value ConvertToValueImpl( const symbol::Broadcast& dim_expr) { const auto& [operands] = dim_expr; - CHECK_GT(operands->size(), 0); + PADDLE_ENFORCE_GT(operands->size(), + 0, + phi::errors::InvalidArgument( + "The size of operands is incorrect. "
+ "Expected size is larger than 0, but received %d.", + operands->size())); pir::Value broadcasted = ConvertToValue(operands->at(0)); for (int i = 1; i < operands->size(); ++i) { pir::Value operand_value = ConvertToValue(operands->at(i)); diff --git a/paddle/cinn/hlir/framework/instruction.cc b/paddle/cinn/hlir/framework/instruction.cc index c7185223843d5..65ac90793472b 100644 --- a/paddle/cinn/hlir/framework/instruction.cc +++ b/paddle/cinn/hlir/framework/instruction.cc @@ -147,27 +147,29 @@ void Instruction::Run( utils::RecordEvent record_args("Instruction::Run", cinn::utils::EventType::kInstruction); + const auto DefaultRun = [&] { #if defined(CINN_WITH_CUDA) && !defined(CINN_WITH_CUDNN) - if (function_name_ == "cublas_gemm" && target_.arch == Target::Arch::NVGPU) { - auto& pod_args = args_cached_[0]; - VLOG(3) << "The pod_args size of cublas_gemm: " << pod_args.size(); - runtime::cuda::cinn_gpu_cublas_gemm(attrs, - pod_args[0], - pod_args[1], - pod_args[2], - pod_args[3], - static_cast(stream)); - } else if (function_name_ == "cublas_matmul" && - target_.arch == Target::Arch::NVGPU) { + VLOG(3) << "Running extern function " << function_name_; + for (int idx = 0; idx < fn_ptrs_.size(); ++idx) { + VLOG(3) << "Running func name: " << fn_names_[idx]; + auto& pod_args = args_cached_[idx]; + CHECK(fn_ptrs_[idx]) << "The LoweredFunc address should be set first by " + "calling SetLoweredFunc method"; + if (!dryrun) { + if (target_ == cinn::common::DefaultNVGPUTarget()) { + ((lower_func_ptr_g)fn_ptrs_[idx])( + static_cast(pod_args.data()), pod_args.size(), stream); + } else { + ((lower_func_ptr_t)fn_ptrs_[idx])(static_cast(pod_args.data()), + pod_args.size()); + } + } + } + VLOG(3) << "Done Running extern function " << function_name_; +#elif defined(CINN_WITH_CUDNN) auto& pod_args = args_cached_[0]; - VLOG(3) << "The pod_args size of cublas_matmul: " << pod_args.size(); - runtime::cuda::cinn_gpu_cublas_gemm(attrs, - pod_args[0], - pod_args[1], - nullptr, - pod_args[2], - static_cast(stream)); - } else { + // Here conv2d and depthwise_conv2d are implemented by one cudnn api + // cudnnConvolutionForward VLOG(3) << "Running extern function " << function_name_; for (int idx = 0; idx < fn_ptrs_.size(); ++idx) { VLOG(3) << "Running func name: " << fn_names_[idx]; @@ -185,136 +187,202 @@ void Instruction::Run( } } VLOG(3) << "Done Running extern function " << function_name_; - } +#else + VLOG(3) << "Running extern function " << function_name_; + for (int idx = 0; idx < fn_ptrs_.size(); ++idx) { + VLOG(3) << "Running func name: " << fn_names_[idx]; + auto& pod_args = args_cached_[idx]; + CHECK(fn_ptrs_[idx]) << "The LoweredFunc address should be set first by " + "calling SetLoweredFunc method"; + if (!dryrun) { + ((lower_func_ptr_t)fn_ptrs_[idx])(static_cast(pod_args.data()), + pod_args.size()); + } + } + VLOG(3) << "Done Running extern function " << function_name_; +#endif + }; + const auto NVGPURun = [&] { +#if defined(CINN_WITH_CUDA) && !defined(CINN_WITH_CUDNN) + if (function_name_ == "cublas_gemm") { + auto& pod_args = args_cached_[0]; + VLOG(3) << "The pod_args size of cublas_gemm: " << pod_args.size(); + runtime::cuda::cinn_gpu_cublas_gemm(attrs, + pod_args[0], + pod_args[1], + pod_args[2], + pod_args[3], + static_cast(stream)); + } else if (function_name_ == "cublas_matmul") { + auto& pod_args = args_cached_[0]; + VLOG(3) << "The pod_args size of cublas_matmul: " << pod_args.size(); + runtime::cuda::cinn_gpu_cublas_gemm(attrs, + pod_args[0], + pod_args[1], + nullptr, + pod_args[2],
static_cast(stream)); + } else { + VLOG(3) << "Running extern function " << function_name_; + for (int idx = 0; idx < fn_ptrs_.size(); ++idx) { + VLOG(3) << "Running func name: " << fn_names_[idx]; + auto& pod_args = args_cached_[idx]; + CHECK(fn_ptrs_[idx]) + << "The LoweredFunc address should be set first by " + "calling SetLoweredFunc method"; + if (!dryrun) { + if (target_ == cinn::common::DefaultNVGPUTarget()) { + ((lower_func_ptr_g)fn_ptrs_[idx])( + static_cast(pod_args.data()), pod_args.size(), stream); + } else { + ((lower_func_ptr_t)fn_ptrs_[idx])( + static_cast(pod_args.data()), pod_args.size()); + } + } + } + VLOG(3) << "Done Running extern function " << function_name_; + } #elif defined(CINN_WITH_CUDNN) - auto& pod_args = args_cached_[0]; - // Here conv2d and depthwise_conv2d are implemented by one cudnn api - // cudnnConvolutionForward - if ((function_name_ == "conv2d" || function_name_ == "depthwise_conv2d") && - target_.arch == Target::Arch::NVGPU) { - if (str_attrs[0] == "forward") { - if (str_attrs.size() > 1 && str_attrs[1] == "NHWC") { + auto& pod_args = args_cached_[0]; + // Here conv2d and depthwise_conv2d are implemented by one cudnn api + // cudnnConvolutionForward + if ((function_name_ == "conv2d" || function_name_ == "depthwise_conv2d")) { + if (str_attrs[0] == "forward") { + if (str_attrs.size() > 1 && str_attrs[1] == "NHWC") { + absl::flat_hash_map attrs_map = { + {"input_n", attrs[0]}, {"input_h", attrs[1]}, + {"input_w", attrs[2]}, {"input_c", attrs[3]}, + {"weights_n", attrs[4]}, {"weights_c", attrs[5]}, + {"weights_h", attrs[6]}, {"weights_w", attrs[7]}, + {"pad_h", attrs[8]}, {"pad_w", attrs[9]}, + {"stride_h", attrs[10]}, {"stride_w", attrs[11]}, + {"dilation_h", attrs[12]}, {"dilation_w", attrs[13]}, + {"groups", attrs[14]}, {"output_n", attrs[15]}, + {"output_h", attrs[16]}, {"output_w", attrs[17]}, + {"output_c", attrs[18]}, + }; + runtime::cuda::cinn_gpu_cudnn_conv2d( + attrs_map, + pod_args[0], + pod_args[1], + pod_args[2], + static_cast(stream), + cinn::common::Layout::kNHWC); + + } else { + absl::flat_hash_map attrs_map = { + {"input_n", attrs[0]}, {"input_c", attrs[1]}, + {"input_h", attrs[2]}, {"input_w", attrs[3]}, + {"weights_n", attrs[4]}, {"weights_c", attrs[5]}, + {"weights_h", attrs[6]}, {"weights_w", attrs[7]}, + {"pad_h", attrs[8]}, {"pad_w", attrs[9]}, + {"stride_h", attrs[10]}, {"stride_w", attrs[11]}, + {"dilation_h", attrs[12]}, {"dilation_w", attrs[13]}, + {"groups", attrs[14]}, {"output_n", attrs[15]}, + {"output_c", attrs[16]}, {"output_h", attrs[17]}, + {"output_w", attrs[18]}, + }; + runtime::cuda::cinn_gpu_cudnn_conv2d( + attrs_map, + pod_args[0], + pod_args[1], + pod_args[2], + static_cast(stream), + cinn::common::Layout::kNCHW); + } + } else if (str_attrs[0] == "backward_data") { + // w, dy, dx absl::flat_hash_map attrs_map = { - {"input_n", attrs[0]}, {"input_h", attrs[1]}, - {"input_w", attrs[2]}, {"input_c", attrs[3]}, - {"weights_n", attrs[4]}, {"weights_c", attrs[5]}, - {"weights_h", attrs[6]}, {"weights_w", attrs[7]}, + {"input_n", attrs[15]}, {"input_c", attrs[16]}, + {"input_h", attrs[17]}, {"input_w", attrs[18]}, + {"weights_n", attrs[0]}, {"weights_c", attrs[1]}, + {"weights_h", attrs[2]}, {"weights_w", attrs[3]}, {"pad_h", attrs[8]}, {"pad_w", attrs[9]}, {"stride_h", attrs[10]}, {"stride_w", attrs[11]}, {"dilation_h", attrs[12]}, {"dilation_w", attrs[13]}, - {"groups", attrs[14]}, {"output_n", attrs[15]}, - {"output_h", attrs[16]}, {"output_w", attrs[17]}, - {"output_c", attrs[18]}, + {"groups", attrs[14]}, 
{"output_n", attrs[4]}, + {"output_c", attrs[5]}, {"output_h", attrs[6]}, + {"output_w", attrs[7]}, }; - runtime::cuda::cinn_gpu_cudnn_conv2d(attrs_map, - pod_args[0], - pod_args[1], - pod_args[2], - static_cast(stream), - cinn::common::Layout::kNHWC); - + // w, dy, dx + runtime::cuda::cinn_gpu_cudnn_conv2d_backward_data( + attrs_map, + pod_args[0], + pod_args[1], + pod_args[2], + static_cast(stream)); } else { + // x, dy, w absl::flat_hash_map attrs_map = { {"input_n", attrs[0]}, {"input_c", attrs[1]}, {"input_h", attrs[2]}, {"input_w", attrs[3]}, - {"weights_n", attrs[4]}, {"weights_c", attrs[5]}, - {"weights_h", attrs[6]}, {"weights_w", attrs[7]}, + {"weights_n", attrs[15]}, {"weights_c", attrs[16]}, + {"weights_h", attrs[17]}, {"weights_w", attrs[18]}, {"pad_h", attrs[8]}, {"pad_w", attrs[9]}, {"stride_h", attrs[10]}, {"stride_w", attrs[11]}, {"dilation_h", attrs[12]}, {"dilation_w", attrs[13]}, - {"groups", attrs[14]}, {"output_n", attrs[15]}, - {"output_c", attrs[16]}, {"output_h", attrs[17]}, - {"output_w", attrs[18]}, + {"groups", attrs[14]}, {"output_n", attrs[4]}, + {"output_c", attrs[5]}, {"output_h", attrs[6]}, + {"output_w", attrs[7]}, }; - runtime::cuda::cinn_gpu_cudnn_conv2d(attrs_map, - pod_args[0], - pod_args[1], - pod_args[2], - static_cast(stream), - cinn::common::Layout::kNCHW); + // x, dy, w + runtime::cuda::cinn_gpu_cudnn_conv2d_backward_filter( + attrs_map, + pod_args[0], + pod_args[1], + pod_args[2], + static_cast(stream)); } - } else if (str_attrs[0] == "backward_data") { - // w, dy, dx - absl::flat_hash_map attrs_map = { - {"input_n", attrs[15]}, {"input_c", attrs[16]}, - {"input_h", attrs[17]}, {"input_w", attrs[18]}, - {"weights_n", attrs[0]}, {"weights_c", attrs[1]}, - {"weights_h", attrs[2]}, {"weights_w", attrs[3]}, - {"pad_h", attrs[8]}, {"pad_w", attrs[9]}, - {"stride_h", attrs[10]}, {"stride_w", attrs[11]}, - {"dilation_h", attrs[12]}, {"dilation_w", attrs[13]}, - {"groups", attrs[14]}, {"output_n", attrs[4]}, - {"output_c", attrs[5]}, {"output_h", attrs[6]}, - {"output_w", attrs[7]}, - }; - // w, dy, dx - runtime::cuda::cinn_gpu_cudnn_conv2d_backward_data( - attrs_map, - pod_args[0], - pod_args[1], - pod_args[2], - static_cast(stream)); - } else { - // x, dy, w - absl::flat_hash_map attrs_map = { - {"input_n", attrs[0]}, {"input_c", attrs[1]}, - {"input_h", attrs[2]}, {"input_w", attrs[3]}, - {"weights_n", attrs[15]}, {"weights_c", attrs[16]}, - {"weights_h", attrs[17]}, {"weights_w", attrs[18]}, - {"pad_h", attrs[8]}, {"pad_w", attrs[9]}, - {"stride_h", attrs[10]}, {"stride_w", attrs[11]}, - {"dilation_h", attrs[12]}, {"dilation_w", attrs[13]}, - {"groups", attrs[14]}, {"output_n", attrs[4]}, - {"output_c", attrs[5]}, {"output_h", attrs[6]}, - {"output_w", attrs[7]}, - }; - // x, dy, w - runtime::cuda::cinn_gpu_cudnn_conv2d_backward_filter( - attrs_map, - pod_args[0], - pod_args[1], - pod_args[2], - static_cast(stream)); - } - } else if (function_name_ == "pool2d" && - target_.arch == Target::Arch::NVGPU) { - runtime::cuda::cinn_gpu_cudnn_pool2d(attrs, - str_attrs, + } else if (function_name_ == "pool2d") { + runtime::cuda::cinn_gpu_cudnn_pool2d(attrs, + str_attrs, + pod_args[0], + pod_args[1], + static_cast(stream)); + } else if (function_name_ == "softmax") { + CHECK_EQ(pod_args.size(), 3); + runtime::cuda::cinn_gpu_cudnn_softmax( + attrs, pod_args[0], pod_args[1], static_cast(stream)); + } else if (function_name_ == "mul") { + CHECK_EQ(pod_args.size(), 4); + runtime::cuda::cinn_gpu_cublas_mul(attrs, pod_args[0], pod_args[1], + pod_args[2], 
static_cast(stream)); - } else if (function_name_ == "softmax" && - target_.arch == Target::Arch::NVGPU) { - CHECK_EQ(pod_args.size(), 3); - runtime::cuda::cinn_gpu_cudnn_softmax( - attrs, pod_args[0], pod_args[1], static_cast(stream)); - } else if (function_name_ == "mul" && target_.arch == Target::Arch::NVGPU) { - CHECK_EQ(pod_args.size(), 4); - runtime::cuda::cinn_gpu_cublas_mul(attrs, - pod_args[0], - pod_args[1], - pod_args[2], - static_cast(stream)); - } else if (function_name_ == "cublas_gemm" && - target_.arch == Target::Arch::NVGPU) { - VLOG(3) << "The pod_args size of cublas_gemm: " << pod_args.size(); - runtime::cuda::cinn_gpu_cublas_gemm(attrs, - pod_args[0], - pod_args[1], - pod_args[2], - pod_args[3], - static_cast(stream)); - } else if (function_name_ == "cublas_matmul" && - target_.arch == Target::Arch::NVGPU) { - auto& pod_args = args_cached_[0]; - VLOG(3) << "The pod_args size of cublas_matmul: " << pod_args.size(); - runtime::cuda::cinn_gpu_cublas_gemm(attrs, - pod_args[0], - pod_args[1], - nullptr, - pod_args[2], - static_cast(stream)); - } else { + } else if (function_name_ == "cublas_gemm") { + VLOG(3) << "The pod_args size of cublas_gemm: " << pod_args.size(); + runtime::cuda::cinn_gpu_cublas_gemm(attrs, + pod_args[0], + pod_args[1], + pod_args[2], + pod_args[3], + static_cast(stream)); + } else if (function_name_ == "cublas_matmul") { + auto& pod_args = args_cached_[0]; + VLOG(3) << "The pod_args size of cublas_matmul: " << pod_args.size(); + runtime::cuda::cinn_gpu_cublas_gemm(attrs, + pod_args[0], + pod_args[1], + nullptr, + pod_args[2], + static_cast(stream)); + } else { + VLOG(3) << "Running extern function " << function_name_; + for (int idx = 0; idx < fn_ptrs_.size(); ++idx) { + VLOG(3) << "Running func name: " << fn_names_[idx]; + auto& pod_args = args_cached_[idx]; + CHECK(fn_ptrs_[idx]) + << "The LoweredFunc address should be set first by " + "calling SetLoweredFunc method"; + if (!dryrun) { + ((lower_func_ptr_g)fn_ptrs_[idx])( + static_cast(pod_args.data()), pod_args.size(), stream); + } + } + VLOG(3) << "Done Running extern function " << function_name_; + } +#else VLOG(3) << "Running extern function " << function_name_; for (int idx = 0; idx < fn_ptrs_.size(); ++idx) { VLOG(3) << "Running func name: " << fn_names_[idx]; @@ -332,37 +400,17 @@ void Instruction::Run( } } VLOG(3) << "Done Running extern function " << function_name_; - } -#else - VLOG(3) << "Running extern function " << function_name_; - for (int idx = 0; idx < fn_ptrs_.size(); ++idx) { - VLOG(3) << "Running func name: " << fn_names_[idx]; - auto& pod_args = args_cached_[idx]; - CHECK(fn_ptrs_[idx]) << "The LoweredFunc address should be set first by " - "calling SetLoweredFunc method"; - if (!dryrun) { - if (target_ == cinn::common::DefaultNVGPUTarget()) { - ((lower_func_ptr_g)fn_ptrs_[idx])( - static_cast(pod_args.data()), pod_args.size(), stream); - } else { - ((lower_func_ptr_t)fn_ptrs_[idx])(static_cast(pod_args.data()), - pod_args.size()); - } - } - } - VLOG(3) << "Done Running extern function " << function_name_; #endif - + }; + target_.arch.Visit(adt::match{ + [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::X86Arch) { DefaultRun(); }, + [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::NVGPUArch) { NVGPURun(); }, + }); if (!cinn::runtime::CheckStringFlagFalse(FLAGS_cinn_self_check_accuracy)) { CheckResults(name2podargs, stream); } - // TODO(thisjiang): revert while flags correct - // if (FLAGS_cinn_sync_run) { - // #ifdef CINN_WITH_CUDA - // 
utils::RecordEvent record_sync("FLAGS_cinn_sync_run"); - // CUDA_CALL(cudaStreamSynchronize(static_cast(stream))); - // #endif - // } } std::string Instruction::DumpInstruction() const { diff --git a/paddle/cinn/hlir/framework/memory.cc b/paddle/cinn/hlir/framework/memory.cc index bfc33b31beda9..d85393db72fb3 100755 --- a/paddle/cinn/hlir/framework/memory.cc +++ b/paddle/cinn/hlir/framework/memory.cc @@ -58,10 +58,10 @@ class CudaMemoryMng : public MemoryInterface { } // namespace MemoryManager::MemoryManager() { - Register(Target::Arch::Unk, new X86MemoryMng); - Register(Target::Arch::X86, new X86MemoryMng); + Register(common::UnknownArch{}, new X86MemoryMng); + Register(common::X86Arch{}, new X86MemoryMng); #ifdef CINN_WITH_CUDA - Register(Target::Arch::NVGPU, new CudaMemoryMng); + Register(common::NVGPUArch{}, new CudaMemoryMng); #endif } diff --git a/paddle/cinn/hlir/framework/memory.h b/paddle/cinn/hlir/framework/memory.h index 889e32e7fca0b..b719ece874f51 100644 --- a/paddle/cinn/hlir/framework/memory.h +++ b/paddle/cinn/hlir/framework/memory.h @@ -19,6 +19,7 @@ #include +#include "paddle/cinn/common/arch_util.h" #include "paddle/cinn/common/macros.h" #include "paddle/cinn/common/target.h" @@ -41,7 +42,7 @@ class MemoryInterface { */ class MemoryManager final { public: - using key_t = cinn::common::Target::Arch; + using key_t = cinn::common::Arch; static MemoryManager& Global() { static auto* x = new MemoryManager; @@ -56,12 +57,14 @@ class MemoryManager final { MemoryInterface* RetrieveSafely(key_t key) { auto* res = Retrieve(key); - CHECK(res) << "no MemoryInterface for architecture [" << key << "]"; + CHECK(res) << "no MemoryInterface for architecture [" << GetArchName(key) + << "]"; return res; } MemoryInterface* Register(key_t key, MemoryInterface* item) { - CHECK(!memory_mngs_.count(key)) << "Duplicate register [" << key << "]"; + CHECK(!memory_mngs_.count(key)) + << "Duplicate register [" << GetArchName(key) << "]"; memory_mngs_[key].reset(item); return item; } @@ -69,8 +72,7 @@ class MemoryManager final { private: MemoryManager(); - absl::flat_hash_map> + absl::flat_hash_map> memory_mngs_; CINN_DISALLOW_COPY_AND_ASSIGN(MemoryManager); diff --git a/paddle/cinn/hlir/framework/pir/CMakeLists.txt b/paddle/cinn/hlir/framework/pir/CMakeLists.txt index 3b09925b94830..bf8cd25f48e4b 100755 --- a/paddle/cinn/hlir/framework/pir/CMakeLists.txt +++ b/paddle/cinn/hlir/framework/pir/CMakeLists.txt @@ -8,7 +8,8 @@ gather_srcs( op_lowering_impl.cc op_mapper.cc op_lowering_util.cc + trivial_op_impl.cc + trivial_op_util.cc compilation_task.cc compilation_cache.cc - trivial_op_impl.cc - trivial_op_util.cc) + fusion_info.cc) diff --git a/paddle/cinn/hlir/framework/pir/compilation_cache.cc b/paddle/cinn/hlir/framework/pir/compilation_cache.cc index 47a38442b58a5..9b98597a50265 100644 --- a/paddle/cinn/hlir/framework/pir/compilation_cache.cc +++ b/paddle/cinn/hlir/framework/pir/compilation_cache.cc @@ -39,38 +39,20 @@ void* BackendResource::GetInferFuncPtr() const { return ptr; } -std::shared_ptr& BackendResource::GetBackendCompiler() { - return backend_compiler_; -} - -const std::shared_ptr& BackendResource::GetBackendCompiler() - const { - return backend_compiler_; -} - -void BackendResource::SetHostFnName(const std::string& name) { - host_fn_name_ = name; -} - -void BackendResource::SetInferFnName(const std::string& name) { - infer_fn_name_ = name; -} - -pir::CINNKernelInfo BackendResource::GernerateKernelInfo( - const std::shared_ptr& group) const { +pir::CINNKernelInfo 
BackendResource::GenerateKernelInfo() const { pir::CINNKernelInfo kernel_info; kernel_info.fn_name = host_fn_name_; kernel_info.fn_ptr = GetHostFuncPtr(); kernel_info.infer_shape_fn_ptr = GetInferFuncPtr(); - kernel_info.int_args_map = group->int_args_map(); + kernel_info.int_args_map = GetIntArgsMap(); return kernel_info; } } // namespace pir bool CompilationCache::Has(const CacheKey& key) const { - const bool has_existed = cache_.find(KeyHash(key)) != cache_.end(); - VLOG(6) << "Check IsExisted in CompilationCache: " << key->FuncName() << " " - << has_existed; + const bool has_existed = cache_.find(key) != cache_.end(); + VLOG(6) << "Check IsExisted in CompilationCache: " << has_existed << " - " + << key; return has_existed; } @@ -79,24 +61,19 @@ const CompilationCache::CacheValue& CompilationCache::Get( PADDLE_ENFORCE_EQ( Has(key), true, - phi::errors::NotFound("%s is not in CompliatonCache.", key->FuncName())); - return cache_.at(KeyHash(key)); + phi::errors::NotFound("%s is not in CompilationCache.", key)); + return cache_.at(key); } pir::CINNKernelInfo CompilationCache::GetKernelInfo(const CacheKey& key) const { - return Get(key)->GetKernelInfo(key); + return Get(key)->GetKernelInfo(); } void CompilationCache::Insert(const CacheKey& key, const CacheValue& value) { - VLOG(6) << "Insert CompilationCache for: " << key->FuncName(); - cache_.insert({KeyHash(key), value}); + VLOG(6) << "Insert CompilationCache for: " << key; + cache_.insert({key, value}); } void CompilationCache::Clear() { cache_.clear(); } -size_t CompilationCache::KeyHash(const CacheKey& key) const { - // TODO(Aurelius84): use a better hash function in next pr. - return std::hash{}(key->FuncName()); -} - } // namespace cinn::hlir::framework diff --git a/paddle/cinn/hlir/framework/pir/compilation_cache.h b/paddle/cinn/hlir/framework/pir/compilation_cache.h index 018bd6fd85572..547a1889f01a6 100644 --- a/paddle/cinn/hlir/framework/pir/compilation_cache.h +++ b/paddle/cinn/hlir/framework/pir/compilation_cache.h @@ -19,6 +19,7 @@ #include "paddle/cinn/backends/compiler.h" #include "paddle/cinn/common/macros.h" #include "paddle/cinn/common/target.h" +#include "paddle/cinn/hlir/framework/pir/fusion_info.h" #include "paddle/cinn/hlir/framework/pir/utils.h" namespace cinn::hlir::framework { @@ -27,76 +28,79 @@ namespace pir { class OpLoweringGroup; class BackendResource final { public: - BackendResource(const Target& target) { - backend_compiler_ = backends::Compiler::Create(target); - } - BackendResource(const Target& target, const std::string& host_fn_name, - const std::string& infer_fn_name) - : host_fn_name_(host_fn_name), infer_fn_name_(infer_fn_name) { + const std::string& infer_fn_name, + const std::map& int_args_map) + : host_fn_name_(host_fn_name), + infer_fn_name_(infer_fn_name), + int_args_map_(int_args_map) { backend_compiler_ = backends::Compiler::Create(target); } void* GetHostFuncPtr() const; void* GetInferFuncPtr() const; - pir::CINNKernelInfo GernerateKernelInfo( - const std::shared_ptr& group) const; - std::shared_ptr& GetBackendCompiler(); - const std::shared_ptr& GetBackendCompiler() const; - void SetHostFnName(const std::string& name); - void SetInferFnName(const std::string& name); + const std::map& GetIntArgsMap() const { + return int_args_map_; + } + const std::shared_ptr& GetBackendCompiler() const { + return backend_compiler_; + } + pir::CINNKernelInfo GenerateKernelInfo() const; private: std::string host_fn_name_; std::string infer_fn_name_; - // std::string host_code_; - // std::vector device_code_; -
std::shared_ptr backend_compiler_; + std::map int_args_map_; + + std::shared_ptr backend_compiler_{nullptr}; }; class CompilationResult final { public: - explicit CompilationResult(const Target& target) - : target_(target), backend_resource_(target) {} - - BackendResource& MutableBackendResource() { return backend_resource_; } - const BackendResource& GetBackendResource() const { + explicit CompilationResult(const Target& target) : target_(target) {} + const std::shared_ptr& GetBackendResource() const { return backend_resource_; } - pir::CINNKernelInfo GetKernelInfo( - const std::shared_ptr& group) { - return backend_resource_.GernerateKernelInfo(group); + + void SetBackendResource(const std::shared_ptr& other) { + backend_resource_ = other; + } + + pir::CINNKernelInfo GetKernelInfo() { + // TODO(Aurelius84): add ENFORCE_NOT_NULL + return backend_resource_->GenerateKernelInfo(); } private: Target target_; - BackendResource backend_resource_; + std::shared_ptr backend_resource_{nullptr}; }; + } // namespace pir class CompilationCache { public: - using CacheKey = std::shared_ptr; + using CacheKey = pir::FusionInfo; using CacheValue = std::shared_ptr; static CompilationCache& Instance() { - static CompilationCache instance; + thread_local static CompilationCache instance; return instance; } bool Has(const CacheKey& key) const; const CacheValue& Get(const CacheKey& key) const; - pir::CINNKernelInfo GetKernelInfo(const CacheKey& key) const; void Insert(const CacheKey& key, const CacheValue& value); void Clear(); - size_t KeyHash(const CacheKey& key) const; + + pir::CINNKernelInfo GetKernelInfo(const CacheKey& key) const; private: CompilationCache() = default; CINN_DISALLOW_COPY_AND_ASSIGN(CompilationCache); - std::unordered_map cache_; + std::unordered_map cache_; }; } // namespace cinn::hlir::framework diff --git a/paddle/cinn/hlir/framework/pir/compilation_task.cc b/paddle/cinn/hlir/framework/pir/compilation_task.cc index a93ac960d496a..85f4d2849ea80 100644 --- a/paddle/cinn/hlir/framework/pir/compilation_task.cc +++ b/paddle/cinn/hlir/framework/pir/compilation_task.cc @@ -42,15 +42,9 @@ std::string GroupCompilationContext::PrintPredicate2Funcs() const { return ss.str(); } -void CompilationTask::operator()() { - VLOG(4) << "Run Compilation Task for : " << context_->group_.get(); - if (CompilationCache::Instance().Has(context_->group_)) { - VLOG(4) << "Found cached kernel info for group: " - << context_->group_->FuncName(); - return; - } +std::shared_ptr CompilationTask::operator()() { Lowering(); - CodegenAndJit(); + return CodegenAndJit(); } void CompilationTask::Lowering() { @@ -62,7 +56,7 @@ void CompilationTask::Lowering() { /* apply pass = */ true)); } -void CompilationTask::CodegenAndJit() { +std::shared_ptr CompilationTask::CodegenAndJit() { ir::Module::Builder builder(cinn::common::UniqName("module"), context_->target_); CHECK_EQ(context_->predicates_.size(), context_->lowered_funcs_.size()); @@ -74,27 +68,22 @@ void CompilationTask::CodegenAndJit() { } builder.SetInferShapeFunc(context_->infer_shape_lowered_func_); ir::Module ir_module = builder.Build(); - BuildPirCINNKernelInfo(ir_module); -} - -pir::CINNKernelInfo CompilationTask::GetCINNKernelInfo() { - if (!CompilationCache::Instance().Has(context_->group_)) { - PADDLE_THROW(phi::errors::NotFound( - "Kernel info has been cached for current group.")); - } - return CompilationCache::Instance().GetKernelInfo(context_->group_); + return BuildPirCINNKernelInfo(ir_module); } -void CompilationTask::BuildPirCINNKernelInfo(const 
ir::Module& module) { +std::shared_ptr CompilationTask::BuildPirCINNKernelInfo( + const ir::Module& module) { auto compilation_result = std::make_shared(context_->target_); - pir::BackendResource& backend_resource = - compilation_result->MutableBackendResource(); - backend_resource.GetBackendCompiler()->Build(module, ""); - backend_resource.SetHostFnName(context_->group_->FuncName()); - backend_resource.SetInferFnName(context_->group_->FuncName() + - "_infer_shape"); - CompilationCache::Instance().Insert(context_->group_, compilation_result); + auto backend_resource = std::make_shared( + context_->target_, + context_->group_->FuncName(), + context_->group_->FuncName() + "_infer_shape", + context_->group_->int_args_map()); + VLOG(5) << "Start to compile module into cuda kernel..."; + backend_resource->GetBackendCompiler()->Build(module, ""); + compilation_result->SetBackendResource(backend_resource); + return compilation_result; } } // namespace framework diff --git a/paddle/cinn/hlir/framework/pir/compilation_task.h b/paddle/cinn/hlir/framework/pir/compilation_task.h index 69e985afd7869..d104d264b6852 100644 --- a/paddle/cinn/hlir/framework/pir/compilation_task.h +++ b/paddle/cinn/hlir/framework/pir/compilation_task.h @@ -50,14 +50,13 @@ class CompilationTask { explicit CompilationTask(GroupCompilationContext* context) : context_(context) {} - void operator()(); - pir::CINNKernelInfo GetCINNKernelInfo(); + std::shared_ptr operator()(); private: void Lowering(); - void CodegenAndJit(); - std::unique_ptr BuildInstruction(); - void BuildPirCINNKernelInfo(const ir::Module& module); + std::shared_ptr CodegenAndJit(); + std::shared_ptr BuildPirCINNKernelInfo( + const ir::Module& module); GroupCompilationContext* context_; }; diff --git a/paddle/cinn/hlir/framework/pir/fusion_info.cc b/paddle/cinn/hlir/framework/pir/fusion_info.cc new file mode 100644 index 0000000000000..f3b1979e6627e --- /dev/null +++ b/paddle/cinn/hlir/framework/pir/fusion_info.cc @@ -0,0 +1,146 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
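The new fusion_info.cc below derives a structural cache key for a fused group by folding per-operation hashes with a boost-style hash_combine. Here is a minimal standalone sketch of that folding scheme, with simplified types (the real code also hashes value-type and attribute info, not just op names):

#include <cstddef>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Same mixing step as boost::hash_combine (and as the hash_combine template
// declared in fusion_info.h below).
template <typename T>
void hash_combine(std::size_t& seed, const T& v) {
  seed ^= std::hash<T>{}(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}

// Simplified stand-in for FusionInfo::hash(): fold the per-op summaries of a
// fused group (here just op names) into one deterministic key.
std::size_t FusionKey(const std::vector<std::string>& op_names) {
  std::size_t seed = 2153;  // same starting seed as FusionInfo::hash()
  for (const auto& name : op_names) hash_combine(seed, name);
  return seed;
}

int main() {
  // Structurally identical groups produce identical keys, which is what
  // lets the compilation cache reuse one compiled kernel across groups.
  std::cout << (FusionKey({"pd_op.exp", "pd_op.add"}) ==
                FusionKey({"pd_op.exp", "pd_op.add"}))
            << "\n";  // prints 1
  std::cout << (FusionKey({"pd_op.exp", "pd_op.add"}) ==
                FusionKey({"pd_op.add", "pd_op.exp"}))
            << "\n";  // prints 0: op order changes the key
  return 0;
}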
+ +#include "paddle/cinn/hlir/framework/pir/fusion_info.h" +#include "paddle/common/enforce.h" +#include "paddle/pir/include/core/ir_printer.h" + +namespace cinn::hlir::framework::pir { + +constexpr static char* kOpCallStack = "op_callstack"; + +std::size_t AttributeInfo::hash() const { return attr_.hash(); } + +std::ostream& operator<<(std::ostream& os, const AttributeInfo& attr_info) { + os << "AttributeInfo - " << attr_info.name_ << ", " << attr_info.hash(); + if (VLOG_IS_ON(7)) { + os << " ("; + ::pir::IrPrinter(os).PrintAttribute(attr_info.attr_); + os << ")"; + } + return os; +} + +std::size_t ValueInfo::hash() const { return type_.hash(); } + +std::ostream& operator<<(std::ostream& os, const ValueInfo& value_info) { + os << "ValueInfo - " << value_info.hash(); + if (VLOG_IS_ON(7)) { + os << "("; + ::pir::IrPrinter(os).PrintType(value_info.type_); + os << ")"; + } + return os; +} + +OperationInfo::OperationInfo(const ::pir::Operation& op) { + name_ = op.name(); + for (const auto value : op.operands_source()) { + if (!value || !value.type()) continue; + input_infos_.emplace_back(value); + } + for (const auto value : op.results()) { + if (!value || !value.type()) continue; + output_infos_.emplace_back(value); + } + // Keep attribute always in order. + const auto& attributes = op.attributes(); + std::map> order_attributes( + attributes.begin(), attributes.end()); + for (const auto& [attr_name, attr_value] : order_attributes) { + if (!attr_value || attr_name == kOpCallStack) continue; + attr_infos_.emplace_back(attr_name, attr_value); + } +} + +std::size_t OperationInfo::hash() const { + std::size_t seed = 1789; + hash_combine(seed, name_); + for (const auto& info : input_infos_) hash_combine(seed, info); + for (const auto& info : output_infos_) hash_combine(seed, info); + for (const auto& info : attr_infos_) hash_combine(seed, info); + return seed; +} + +std::ostream& operator<<(std::ostream& os, const OperationInfo& op_info) { + os << op_info.name_ << " - " << op_info.hash(); + if (VLOG_IS_ON(7)) { + os << "{\n"; + for (const auto& info : op_info.input_infos_) os << info << "\n"; + for (const auto& info : op_info.output_infos_) os << info << "\n"; + for (const auto& info : op_info.attr_infos_) os << info << "\n"; + os << "}"; + } + return os; +} + +FusionInfo::FusionInfo(const OpLoweringGroup& group) { + for (const auto* op : TopologySort(group)) { + op_infos_.emplace_back(*op); + } +} + +std::size_t FusionInfo::hash() const { + if (cached_hash_value_ != 0U) { + return cached_hash_value_; + } + std::size_t seed = 2153; + for (const auto& info : op_infos_) hash_combine(seed, info); + return seed; +} + +std::ostream& operator<<(std::ostream& os, const FusionInfo& fusion_info) { + os << "FusionInfo - " << fusion_info.hash(); + if (VLOG_IS_ON(5)) { + os << "{\n"; + for (const auto& op_info : fusion_info.op_infos_) os << op_info << "\n"; + os << "}\n"; + } + return os; +} + +std::size_t HashIntArgsMap( + const std::map& int_args_map) { + std::size_t seed = 2153; + for (const auto& [input_idx, dim_idx] : int_args_map) { + hash_combine(seed, input_idx); + hash_combine(seed, dim_idx.arg_idx); + hash_combine(seed, dim_idx.dim_idx); + } + return seed; +} +std::ostream& operator<<( + std::ostream& os, + const std::map& int_args_map) { + os << "int_args_map: {\n"; + for (const auto& [input_idx, dim_idx] : int_args_map) { + os << "input_idx: " << input_idx << ":[ " << dim_idx.arg_idx << ", " + << dim_idx.dim_idx << " ]\n"; + } + os << "}\n"; +} + +std::vector TopologySort( + const OpLoweringGroup& 
group) { + // NOTE(Aurelius84): Use simplest one-by-one order temporaly. + auto* block = group.GetParentBlock(); + std::vector ops; + ops.reserve(block->size()); + for (auto& op : *block) { + ops.push_back(&op); + } + return ops; +} + +} // namespace cinn::hlir::framework::pir diff --git a/paddle/cinn/hlir/framework/pir/fusion_info.h b/paddle/cinn/hlir/framework/pir/fusion_info.h new file mode 100644 index 0000000000000..477e6934319cf --- /dev/null +++ b/paddle/cinn/hlir/framework/pir/fusion_info.h @@ -0,0 +1,118 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/cinn/hlir/framework/pir/op_lowering_group.h" + +namespace cinn::hlir::framework::pir { + +class AttributeInfo { + public: + AttributeInfo(const std::string &name, const ::pir::Attribute &attr) + : name_(name), attr_(attr) {} + + std::size_t hash() const; + friend std::ostream &operator<<(std::ostream &os, const AttributeInfo &info); + + private: + std::string name_; + ::pir::Attribute attr_; +}; + +class ValueInfo { + public: + explicit ValueInfo(const ::pir::Value &value) : type_(value.type()) {} + + std::size_t hash() const; + friend std::ostream &operator<<(std::ostream &os, const ValueInfo &info); + + private: + // All value information is in TypeStorage. 
+ ::pir::Type type_; +}; + +class OperationInfo { + public: + explicit OperationInfo(const ::pir::Operation &op); + + std::size_t hash() const; + friend std::ostream &operator<<(std::ostream &os, const OperationInfo &info); + + private: + std::string name_; + std::vector input_infos_; + std::vector output_infos_; + std::vector attr_infos_; +}; + +class FusionInfo { + using IntArgsMap = std::map; + + public: + explicit FusionInfo(const OpLoweringGroup &group); + FusionInfo() = delete; + FusionInfo(const FusionInfo &) = default; + FusionInfo(FusionInfo &&) = default; + + std::size_t hash() const; + + bool operator==(const FusionInfo &other) const { + return this->hash() == other.hash(); + } + friend std::ostream &operator<<(std::ostream &os, const FusionInfo &info); + + private: + std::vector op_infos_; + mutable std::size_t cached_hash_value_{0}; +}; + +std::ostream &operator<<(std::ostream &os, const AttributeInfo &info); +std::ostream &operator<<(std::ostream &os, const ValueInfo &info); +std::ostream &operator<<(std::ostream &os, const OperationInfo &info); +std::ostream &operator<<(std::ostream &os, const FusionInfo &info); + +// See boost.hash_combine for details +template +inline void hash_combine(std::size_t &seed, // NOLINT + const T &v) { + std::hash hasher; + seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2); +} + +std::size_t HashIntArgsMap( + const std::map &int_args_map); +std::ostream &operator<<( + std::ostream &os, + const std::map &int_args_map); +std::vector TopologySort( + const OpLoweringGroup &group); + +} // namespace cinn::hlir::framework::pir + +namespace std { +#define REGISTER_STD_HASH(class_name) \ + template <> \ + struct hash { \ + std::size_t operator()( \ + const cinn::hlir::framework::pir::class_name &obj) const { \ + return obj.hash(); \ + } \ + }; + +REGISTER_STD_HASH(AttributeInfo); +REGISTER_STD_HASH(ValueInfo); +REGISTER_STD_HASH(OperationInfo); +REGISTER_STD_HASH(FusionInfo) } // namespace std diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_group.cc b/paddle/cinn/hlir/framework/pir/op_lowering_group.cc index 8799c84969a04..f9bfed7c92727 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_group.cc +++ b/paddle/cinn/hlir/framework/pir/op_lowering_group.cc @@ -19,6 +19,113 @@ namespace hlir { namespace framework { namespace pir { +::pir::Program* OpLoweringGroup::GetParentProgram() const { + PADDLE_ENFORCE_GT(ops_.size(), + 0, + ::common::errors::PreconditionNotMet( + "Require at least one op in the group.")); + PADDLE_ENFORCE_NOT_NULL( + ops_[0], + ::common::errors::Unavailable("Found group.ops_[0] is nullptr.")); + return ops_[0]->GetParentProgram(); +} + +::pir::Block* OpLoweringGroup::GetParentBlock() const { + PADDLE_ENFORCE_GT(this->ops_.size(), + 0, + ::common::errors::PreconditionNotMet( + "Required at least one operation in OpLoweringGroup.")); + auto* block = this->ops_[0]->GetParent(); + PADDLE_ENFORCE_NOT_NULL( + block, + ::common::errors::Unavailable( + "Required inner op's parent block must not be nullptr.")); + for (size_t i = 1; i < this->ops_.size(); ++i) { + PADDLE_ENFORCE_EQ(this->ops_[i]->GetParent(), + block, + ::common::errors::PreconditionNotMet( + "Required all ops to belong to the same block.")); + } + + return block; +} + +std::vector<::pir::Value> OpLoweringGroup::GetGroupOutputValues() const { + std::unordered_set<::pir::Operation*> group_ops_set(this->ops_.begin(), + this->ops_.end()); + + std::vector<::pir::Value> output_values; + for (auto* op : this->ops_) { + for (size_t i = 0; i < op->num_results(); ++i) {
auto result = op->result(i); + if (!result) { + continue; + } + for (auto use_iter = result.use_begin(); use_iter != result.use_end(); + ++use_iter) { + auto* use_op = use_iter->owner(); + if (group_ops_set.find(use_op) == group_ops_set.end()) { + output_values.push_back(result); + break; + } + } + } + } + return output_values; +} + +std::unordered_set<::pir::Value> OpLoweringGroup::GetInputOpValues() const { + std::unordered_set<::pir::Value> group_inputs; + std::unordered_set<::pir::Operation*> ops_set(this->ops_.begin(), + this->ops_.end()); + + // Count all ops' input Values + for (auto op : ops_set) { + for (auto& value : op->operands_source()) { + if (!value || !value.type() || ops_set.count(value.defining_op())) + continue; + // if the input value owner op is not in OpSet, it's the group's input + group_inputs.insert(value); + } + } + return group_inputs; +} + +std::unordered_set<::pir::Value> OpLoweringGroup::GetOutputOpValues() const { + std::unordered_set<::pir::Value> group_outputs; + + for (auto op : this->output_ops_) { + for (auto& result : op->results()) { + if (!result || !result.type()) { + continue; + } + + group_outputs.insert(result); + } + } + return group_outputs; +} + +const symbol::ShapeOrDataDimExprs& OpLoweringGroup::GetShapeOrDataExprs( + const ::pir::Value& value) const { + PADDLE_ENFORCE_EQ(HasShapeOrDataExprs(value), + true, + ::common::errors::NotFound( + "value not found in value_to_shape_or_data_exprs_")); + return value_to_shape_or_data_exprs_.at(value); +} + +void OpLoweringGroup::SetShapeOrDataExprs( + const ::pir::Value& value, + const symbol::ShapeOrDataDimExprs& shape_or_data) { + auto iter = value_to_shape_or_data_exprs_.find(value); + if (iter == value_to_shape_or_data_exprs_.end()) { + value_to_shape_or_data_exprs_.emplace(value, shape_or_data); + } else { + iter->second = shape_or_data; + } +} + std::shared_ptr OpLoweringGroup::Clone( ::pir::Block* target_block, ::pir::IrMapping* ir_mapping) const { std::vector<::pir::Operation*> new_ops; @@ -46,7 +153,6 @@ std::shared_ptr OpLoweringGroup::Clone( new_group->input_names_ = this->input_names_; new_group->output_names_ = this->output_names_; - new_group->fn_name_ = this->fn_name_; new_group->int_args_map_ = this->int_args_map_; new_group->alignment_schedule_info_ = this->alignment_schedule_info_; new_group->reduce_axis_ = this->reduce_axis_; diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_group.h b/paddle/cinn/hlir/framework/pir/op_lowering_group.h index aaa2f31f0a60c..bfaf843cdf5f0 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_group.h +++ b/paddle/cinn/hlir/framework/pir/op_lowering_group.h @@ -22,6 +22,7 @@ #include "paddle/cinn/common/context.h" #include "paddle/cinn/hlir/framework/op.h" #include "paddle/cinn/hlir/framework/pir/utils.h" +#include "paddle/common/enforce.h" #include "paddle/pir/include/core/builtin_type_interfaces.h" #include "paddle/pir/include/core/operation.h" #include "paddle/pir/include/core/value.h" @@ -38,124 +39,34 @@ namespace framework { namespace pir { class OpLoweringGroup { public: - OpLoweringGroup() = default; OpLoweringGroup(const OpLoweringGroup&) = delete; OpLoweringGroup(OpLoweringGroup&&) = delete; explicit OpLoweringGroup(const std::vector<::pir::Operation*>& group_ops) - : ops_(group_ops) {} - - explicit OpLoweringGroup(std::initializer_list<::pir::Operation*> group_ops) - : ops_(group_ops) {} - - struct SharedGroupHasher { - size_t operator()( - const std::shared_ptr& group) const noexcept { - return std::hash()(group->group_id()); - } - }; -
struct SharedGroupComparator { - bool operator()( - const std::shared_ptr& first, - const std::shared_ptr& second) const noexcept { - return first->group_id() == second->group_id(); - } - }; - - std::vector<::pir::Value> GetGroupOutputValues() const { - std::unordered_set<::pir::Operation*> group_ops_set(this->ops_.begin(), - this->ops_.end()); - - std::vector<::pir::Value> output_values; - for (auto* op : this->ops_) { - for (size_t i = 0; i < op->num_results(); ++i) { - auto result = op->result(i); - if (!result) { - continue; - } - for (auto use_iter = result.use_begin(); use_iter != result.use_end(); - ++use_iter) { - auto* use_op = use_iter->owner(); - if (group_ops_set.find(use_op) == group_ops_set.end()) { - output_values.push_back(result); - break; - } - } - } - } - return output_values; - } - - std::unordered_set<::pir::Value> GetInputOpValues() const { - std::unordered_set<::pir::Value> group_inputs; - - std::unordered_set<::pir::Operation*> ops_set; - for (auto op : this->ops_) { - ops_set.insert(op); - } - - // count all op's input Value - for (auto op : this->ops_) { - for (auto& value : op->operands_source()) { - if (!value || !value.type()) { - continue; - } - - if (!ops_set.count(value.defining_op())) { - // if the input value owner op is not in OpSet, it's the group's input - group_inputs.insert(value); - continue; - } - } - } - - return group_inputs; + : ops_(group_ops) { + fn_name_ = CompatibleInfo::GroupOpsName(ops_); } - std::unordered_set<::pir::Value> GetOutputOpValues() const { - std::unordered_set<::pir::Value> group_outputs; - - for (auto op : this->output_ops_) { - for (auto& result : op->results()) { - if (!result || result.type()) { - continue; - } - - group_outputs.insert(result); - } - } - return group_outputs; - } - - std::string FuncName() const { - if (fn_name_ == "") { - // TODO(Aurelius84): Polish this implementation. 
-      const_cast<OpLoweringGroup*>(this)->fn_name_ =
-          CompatibleInfo::GroupOpsName(ops_);
-    }
-    return this->fn_name_;
+  explicit OpLoweringGroup(std::initializer_list<::pir::Operation*> group_ops)
+      : ops_(group_ops) {
+    fn_name_ = CompatibleInfo::GroupOpsName(ops_);
   }

+  const std::string& FuncName() const { return this->fn_name_; }
+
+  ::pir::Block* GetParentBlock() const;
+  ::pir::Program* GetParentProgram() const;
+  std::vector<::pir::Value> GetGroupOutputValues() const;
+  std::unordered_set<::pir::Value> GetInputOpValues() const;
+  std::unordered_set<::pir::Value> GetOutputOpValues() const;
+
   const symbol::ShapeOrDataDimExprs& GetShapeOrDataExprs(
-      const ::pir::Value& value) const {
-    CHECK(value_to_shape_or_data_exprs_.count(value))
-        << "value not found in value_to_shape_or_data_exprs_";
-    return value_to_shape_or_data_exprs_.at(value);
-  }
+      const ::pir::Value& value) const;

   bool HasShapeOrDataExprs(const ::pir::Value& value) const {
     return value_to_shape_or_data_exprs_.count(value);
   }

   void SetShapeOrDataExprs(const ::pir::Value& value,
-                           const symbol::ShapeOrDataDimExprs& shape_or_data) {
-    auto iter = value_to_shape_or_data_exprs_.find(value);
-    if (iter == value_to_shape_or_data_exprs_.end()) {
-      value_to_shape_or_data_exprs_.emplace(value, shape_or_data);
-    } else {
-      iter->second = shape_or_data;
-    }
-  }
+                           const symbol::ShapeOrDataDimExprs& shape_or_data);

   void WalkOps(const std::function<void(::pir::Operation*)>& VisitOp) const {
     for (const auto& op : ops_) {
@@ -164,23 +75,17 @@ class OpLoweringGroup {
   }

   const std::vector<::pir::Operation*>& ops() const { return ops_; }
-
   std::vector<::pir::Operation*>& mut_ops() { return ops_; }
-
   void SetOps(const std::vector<::pir::Operation*>& new_ops) { ops_ = new_ops; }

   const std::vector<std::string>& input_names() const {
     return this->input_names_;
   }
-
   std::vector<std::string>& mut_input_names() { return this->input_names_; }
-
   const std::vector<std::string>& output_names() const {
     return this->output_names_;
   }
-
   std::vector<std::string>& mut_output_names() { return this->output_names_; }
-
   const std::vector<::pir::Value>& output_values() const {
     return this->output_values_;
   }
@@ -188,22 +93,25 @@ class OpLoweringGroup {
   std::vector<::pir::Value>& mut_output_values() {
     return this->output_values_;
   }
-
   const std::unordered_set<::pir::Operation*>& output_ops() const {
     return this->output_ops_;
   }
-
   std::unordered_set<::pir::Operation*>& mut_output_ops() {
     return this->output_ops_;
   }

   std::shared_ptr<adt::MapExprCtx> mut_map_expr_ctx() {
-    CHECK_NOTNULL(map_expr_ctx_);
+    PADDLE_ENFORCE_NOT_NULL(
+        map_expr_ctx_,
+        ::common::errors::Unavailable("Required map_expr_ctx_ != nullptr."));
     return map_expr_ctx_;
   }

   const adt::MapExprCtx& map_expr_ctx() const {
-    return *CHECK_NOTNULL(map_expr_ctx_);
+    PADDLE_ENFORCE_NOT_NULL(
+        map_expr_ctx_,
+        ::common::errors::Unavailable("Required map_expr_ctx_ != nullptr."));
+    return *map_expr_ctx_;
   }

   void set_value_to_shape_or_data_exprs(
@@ -285,6 +193,7 @@ class OpLoweringGroup {
   std::string group_id_{common::UniqName("group_")};
   // op in this group
   std::vector<::pir::Operation*> ops_;
+  std::string fn_name_;
   // output ops of the group.
   std::unordered_set<::pir::Operation*> output_ops_;
   // op pattern kind.
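The op_lowering_group.h change above replaces the lazy, const_cast-based caching of fn_name_ with eager initialization in both constructors, so FuncName() shrinks to a genuinely const accessor that can return a const reference. A minimal standalone sketch of the two patterns; LazyGroup, EagerGroup, and MakeName below are illustrative stand-ins, not the real CINN types:

    #include <string>
    #include <utility>
    #include <vector>

    // Hypothetical stand-in for CompatibleInfo::GroupOpsName.
    std::string MakeName(const std::vector<int>& ops) {
      return "fn_group_" + std::to_string(ops.size());
    }

    // Before: the name is computed on first use, which forces a const_cast
    // inside a const member function.
    class LazyGroup {
     public:
      explicit LazyGroup(std::vector<int> ops) : ops_(std::move(ops)) {}
      std::string FuncName() const {
        if (fn_name_.empty()) {
          // Mutation hidden inside a const method.
          const_cast<LazyGroup*>(this)->fn_name_ = MakeName(ops_);
        }
        return fn_name_;
      }

     private:
      std::vector<int> ops_;
      std::string fn_name_;
    };

    // After: the name is fixed at construction, so the accessor never mutates
    // state and can safely hand out a reference.
    class EagerGroup {
     public:
      explicit EagerGroup(std::vector<int> ops)
          // ops_ is declared before fn_name_, so this init order is well-defined.
          : ops_(std::move(ops)), fn_name_(MakeName(ops_)) {}
      const std::string& FuncName() const { return fn_name_; }

     private:
      std::vector<int> ops_;
      std::string fn_name_;
    };

The eager form also removes the data race the lazy version would have if FuncName() were first called concurrently, which matters once group compilation runs under parallel_run as in the pir_compiler.cc changes later in this patch.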
@@ -293,7 +202,6 @@ class OpLoweringGroup {
   std::vector<std::string> input_names_;
   std::vector<std::string> output_names_;
   std::vector<::pir::Value> output_values_;
-  std::string fn_name_{""};
   std::map int_args_map_;

   alignment_schedule_info_t alignment_schedule_info_;
diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc
index eea87c639cc96..bab37b959ddfc 100644
--- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc
+++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc
@@ -204,6 +204,7 @@ BucketLoweredFuncsWrapper OpLowererImpl::BucketLower(
   if (ops.size() == 1 && ops[0]->name() == "custom_call") {
     return {{{ir::Expr(1), LowerCustomCall(group)[0]}}, ir::LoweredFunc()};
   }
+  std::vector<ir::Tensor> group_func_arg_tensors;
   std::unordered_map<::pir::Value, ir::Tensor> tensor_map;
   // for some op, it will output more tmp value and regard as
diff --git a/paddle/cinn/hlir/framework/pir/trivial_op_impl.cc b/paddle/cinn/hlir/framework/pir/trivial_op_impl.cc
index 8b97871211a55..deda666331f2f 100644
--- a/paddle/cinn/hlir/framework/pir/trivial_op_impl.cc
+++ b/paddle/cinn/hlir/framework/pir/trivial_op_impl.cc
@@ -13,8 +13,10 @@
 // limitations under the License.

 #include "paddle/cinn/hlir/framework/pir/trivial_op_impl.h"
-
 #include
+#include "paddle/cinn/operator_fusion/backend/pattern.h"
+#include "paddle/cinn/operator_fusion/backend/pattern_fuser.h"
+#include "paddle/cinn/operator_fusion/group_cluster.h"

 #include "paddle/cinn/hlir/dialect/operator/ir/manual_op.h"
 #include "paddle/cinn/hlir/framework/compile_error.h"
@@ -327,66 +329,17 @@ ir::Expr CreateExprWithNewComputeBody(const FusibleOp& fusible_op,
   return std::visit(Visitor(new_compute_body), fusible_op);
 }

-FusionNode::FusionNode(FusibleOp fusible_op) : fusible_op(fusible_op) {}
-
-std::string FusionNode::GetTensorCounter() {
-  static int i = 0;
-  return std::to_string(i++);
-}
-
-void FusionNode::replace_topo_structure_of_fused_nodes(
-    FusionNode* fused_up_node, FusionNode* fused_down_node) {
-  upstream.insert(fused_up_node->upstream.begin(),
-                  fused_up_node->upstream.end());
-  upstream.insert(fused_down_node->upstream.begin(),
-                  fused_down_node->upstream.end());
-  upstream.erase(fused_up_node);
-
-  downstream.insert(fused_up_node->downstream.begin(),
-                    fused_up_node->downstream.end());
-  downstream.insert(fused_down_node->downstream.begin(),
-                    fused_down_node->downstream.end());
-  downstream.erase(fused_down_node);
-
-  expr_related_op = fused_down_node->expr_related_op;
-
-  for (const auto& pair_data : upstream) {
-    FusionNode* upstream_node = pair_data.first;
-    ::pir::Value related_value = pair_data.second;
-    if (upstream_node->downstream.find(fused_up_node) !=
-        upstream_node->downstream.end()) {
-      upstream_node->downstream.erase(fused_up_node);
-    }
-    if (upstream_node->downstream.find(fused_down_node) !=
-        upstream_node->downstream.end()) {
-      upstream_node->downstream.erase(fused_down_node);
-    }
-    upstream_node->downstream[this] = related_value;
-  }
-
-  for (const auto& pair_data : downstream) {
-    FusionNode* downstream_node = pair_data.first;
-    ::pir::Value related_value = pair_data.second;
-    if (downstream_node->upstream.find(fused_up_node) !=
-        downstream_node->upstream.end()) {
-      downstream_node->upstream.erase(fused_up_node);
-    }
-    if (downstream_node->upstream.find(fused_down_node) !=
-        downstream_node->upstream.end()) {
-      downstream_node->upstream.erase(fused_down_node);
-    }
-    downstream_node->upstream[this] = related_value;
-  }
-}
+bool CheckAllLoopRangeEq(ReduceOp reduce_upper, TrivialOp trivial_down) {}
-bool FusionNode::IsTrivial() const {
-  return std::holds_alternative<TrivialOp>(fusible_op);
+int GetTensorCounter() {
+  static int counter = 1;
+  return counter++;
 }

-bool CheckAllLoopRangeEq(ReduceOp reduce_upper, TrivialOp trivial_down) {}
-
-std::vector<FusibleOp> TransformReduceLoopRange(const ReduceOp& upstream,
-                                                FusibleOp* downstream) {
+std::vector<FusibleOp> TransformReduceLoopRange(
+    const ReduceOp& upstream,
+    FusibleOp* downstream,
+    std::vector<size_t> fake_reduce_iter_idx) {
   // downstream will be mutated by this transform.
   VLOG(4) << "RRTransform begin";
   VLOG(4) << "RRTransform Upstream is \n" << _GetRootExpr(upstream);
@@ -396,13 +349,22 @@ std::vector<FusibleOp> TransformReduceLoopRange(const ReduceOp& upstream,
       modified_downstream_compute_body, GetOutputTensor(upstream));
   std::vector<FusibleOp> results;
   ir::Tensor downstream_output_tensor = GetOutputTensor(*downstream);
+
+  bool is_trivial_downstream = std::holds_alternative<TrivialOp>(*downstream);
+
   const auto create_new_tensor = [&](const ir::Tensor& downstream_load_tensor) {
     VLOG(4) << "Create New Tensor Start";
     ir::Tensor result = ir::Tensor(
-        downstream_load_tensor->name + "_" + FusionNode::GetTensorCounter(),
+        downstream_load_tensor->name + "_" + std::to_string(GetTensorCounter()),
        downstream_load_tensor->type(),
-        downstream_output_tensor->shape,
-        downstream_output_tensor->domain,
+        is_trivial_downstream
+            ? FilterWithFakeReduceIter(downstream_output_tensor->shape,
+                                       fake_reduce_iter_idx)
+            : downstream_output_tensor->shape,
+        is_trivial_downstream
+            ? FilterWithFakeReduceIter(downstream_output_tensor->domain,
+                                       fake_reduce_iter_idx)
+            : downstream_output_tensor->domain,
         GetOutputTensor(upstream)->operation,
         GetReduceIters(upstream));
     result->WithBuffer();
@@ -414,7 +376,10 @@ std::vector<FusibleOp> TransformReduceLoopRange(const ReduceOp& upstream,
     const auto& new_tensor =
         create_new_tensor(load_tensor.As<ir::Load>()->tensor.as_tensor_ref());
     ir::Expr new_reduce = CreateReduceExpr(
-        GetOutputIters(*downstream),
+        is_trivial_downstream
+            ? FilterWithFakeReduceIter(GetOutputIters(*downstream),
+                                       fake_reduce_iter_idx)
+            : GetOutputIters(*downstream),
         GetReduceIters(upstream),
         GetInitExpr(upstream),
         ComposeUtils::CopyedReplaceExpr(GetComputeBody(upstream),
@@ -423,10 +388,16 @@ std::vector<FusibleOp> TransformReduceLoopRange(const ReduceOp& upstream,
         new_tensor,
         GetOutputTensor(upstream));
     results.emplace_back(ReduceOp(new_reduce));
+    VLOG(4) << "After Tmp Transform, upstream is : \n"
+            << _GetRootExpr(results.back());
     ExprTransformerUtils::ReplaceTarget(
         &modified_downstream_compute_body,
         load_tensor,
-        new_tensor(ComposeUtils::VarVec2ExprVec(GetOutputIters(*downstream))));
+        new_tensor(ComposeUtils::VarVec2ExprVec(
+            is_trivial_downstream
+                ? FilterWithFakeReduceIter(GetOutputIters(*downstream),
+                                           fake_reduce_iter_idx)
+                : GetOutputIters(*downstream))));
   }
   _SetFuncBody(*downstream,
                CreateExprWithNewComputeBody(*downstream,
@@ -436,61 +407,52 @@ std::vector<FusibleOp> TransformReduceLoopRange(const ReduceOp& upstream,
   return results;
 }

-FusibleOp TrivialFusion(FusionNode* upstream, FusionNode* downstream) {
-  CHECK(upstream->IsTrivial());
-  if (downstream->IsTrivial()) {
-    return TrivalxOther_Fusion(std::get<TrivialOp>(upstream->fusible_op),
-                               std::get<TrivialOp>(downstream->fusible_op));
-  } else {
-    return TrivalxOther_Fusion(std::get<TrivialOp>(upstream->fusible_op),
-                               std::get<ReduceOp>(downstream->fusible_op));
+FusibleOp SinkTrivialLoopAlign(TrivialOp trivial_op,
+                               ReduceOp reduce_op,
+                               std::vector<size_t> fake_reduce_iter_idx) {
+  VLOG(4) << "SinkTrivialLoopAlign";
+  ir::Expr new_trivial_body = ir::ir_utils::IRCopy(trivial_op.GetFuncBody());
+  std::vector<ir::Var> all_out_iter_vars = GetOutputIters(trivial_op);
+  std::vector<ir::Var> non_reduce_iter_vars =
+      FilterWithFakeReduceIter(all_out_iter_vars, fake_reduce_iter_idx);
+  std::vector<ir::Var> fake_reduce_iter_vars;
+  for (const auto& idx : fake_reduce_iter_idx) {
+    fake_reduce_iter_vars.emplace_back(
+        all_out_iter_vars.at(static_cast(idx)));
   }
-}

-FusibleOp SinkTrivialLoopAlign(TrivialOp trivial_op, ReduceOp reduce_op) {
-  ir::Expr new_trivial_body = ir::ir_utils::IRCopy(trivial_op.GetFuncBody());
-  ir::Var last_iter = GetOutputIters(trivial_op).back();
-  ir::Expr trivial_last_for = (ExprSetFinderUtils::ChildFors *
-                               ExprSetFinderUtils::IsForIterVar(last_iter))
-                                  .GetSingle(new_trivial_body);
+  VLOG(4) << "all_out_iter_vars: "
+          << cinn::utils::Join(all_out_iter_vars, ", ");
+  VLOG(4) << "non_reduce_iter_vars: "
+          << cinn::utils::Join(non_reduce_iter_vars, ", ");
+  VLOG(4) << "fake_reduce_iter_vars: "
+          << cinn::utils::Join(fake_reduce_iter_vars, ", ");
+
+  ir::Expr trivial_last_for =
+      (ExprSetFinderUtils::ChildFors *
+       ExprSetFinderUtils::IsForIterVar(all_out_iter_vars.back()))
+          .GetSingle(new_trivial_body);
   ir::Expr new_for_body = trivial_last_for.As<ir::For>()->body;
-  new_for_body = ExprTransformerUtils::WrapForsTransformer(
-      GetReduceIters(reduce_op))(new_for_body);
-  trivial_last_for.As<ir::For>()->body = new_for_body;
-  return TrivialOp(new_trivial_body);
-}

-std::vector<FusibleOp> ReduceTransformRecursive(FusibleOp root_op,
-                                                FusionNode* fusion_tree) {
-  VLOG(4) << "ReduceTransformRecursive: " << *_GetFuncBodyPointer(root_op);
-  std::vector<FusibleOp> result;
-  for (auto& pair : fusion_tree->upstream) {
-    auto transformed_nodes = TransformReduceLoopRange(
-        std::get<ReduceOp>(pair.first->fusible_op), &root_op);
-    for (auto& node : transformed_nodes) {
-      auto child_flatten = ReduceTransformRecursive(node, pair.first);
-      result.insert(result.end(), child_flatten.begin(), child_flatten.end());
+  const auto ExpandIterVars = [&]() {
+    std::vector<ir::Var> result =
+        ComposeUtils::ConcatVector(non_reduce_iter_vars, fake_reduce_iter_vars);
+    auto upstream_reduce_iters = GetReduceIters(reduce_op);
+    if (fake_reduce_iter_vars.size() != upstream_reduce_iters.size()) {
+      result.insert(result.end(),
+                    upstream_reduce_iters.begin(),
+                    upstream_reduce_iters.end());
    }
-  }
-  VLOG(4) << "Before push_back, is trivial_op: "
-          << std::holds_alternative<TrivialOp>(root_op);
-  result.push_back(
-      std::holds_alternative<TrivialOp>(root_op)
-          ?
SinkTrivialLoopAlign( - std::get(root_op), - std::get( - fusion_tree->upstream.begin()->first->fusible_op)) - : root_op); - VLOG(4) << "After push_back."; - return result; -} + VLOG(4) << "ExpandIterVars: " << cinn::utils::Join(result, ", "); + return result; + }; -std::vector ReduceTransform(FusionNode* downstream) { - if (downstream->IsTrivial() && downstream->upstream.empty()) { - return {downstream->fusible_op}; - } - auto reduces = ReduceTransformRecursive(downstream->fusible_op, downstream); - return reduces; + ir::Expr new_schedule_realizer = + (ExprTransformerUtils::WrapForsTransformer(ExpandIterVars()) * + ExprTransformerUtils::WrapScheduleRealizer({}, "root"))(new_for_body); + + VLOG(4) << "new_schedule_realizer\n" << new_schedule_realizer; + return TrivialOp(new_schedule_realizer); } FusibleOp CreateFusibleOp(ir::Expr compute_body, OpPatternKind op_pattern) { @@ -512,77 +474,6 @@ std::vector FilterVector(const std::vector& ops, const F& f) { return res; } -FusionGraph::FusionGraph(const std::vector<::pir::Operation*>& ops, - const std::vector& op_compute_bodies) { - // shardable_axes_ = InferShardableAxes(ops); - VLOG(4) << "CreateFusionGraph"; - const auto& filtered_ops = FilterVector(ops, [](const ::pir::Operation* op) { - if (op->name() == "cinn_op.generate_shape") { - return false; - } - return true; - }); - const auto& op_patterns = GetOpPatternKindVector(filtered_ops); - CheckFusionInputValid(op_compute_bodies, op_patterns); - - std::unordered_map<::pir::Operation*, FusionNode*> op_to_node_map; - - for (int i = 0; i < filtered_ops.size(); ++i) { - FusionNode* node = - new FusionNode(CreateFusibleOp(op_compute_bodies[i], op_patterns[i])); - op_to_node_map[filtered_ops[i]] = node; - all_fusion_nodes_.emplace(node); - node->expr_related_op = filtered_ops[i]; - } - - for (::pir::Operation* op : filtered_ops) { - FusionNode* cur_node = op_to_node_map[op]; - - // add upstream nodes - for (int i = 0; i < op->num_operands(); ++i) { - ::pir::Value related_value = op->operand_source(i); - ::pir::Operation* input_op = related_value.defining_op(); - if (op_to_node_map.find(input_op) != op_to_node_map.end()) { - FusionNode* upstream_node = op_to_node_map[input_op]; - cur_node->upstream[upstream_node] = related_value; - upstream_node->downstream[cur_node] = related_value; - } - } - - // add downstream nodes - for (int i = 0; i < op->num_results(); ++i) { - ::pir::Value related_value = op->result(i); - for (auto consumer_it = related_value.use_begin(); - consumer_it != related_value.use_end(); - ++consumer_it) { - ::pir::Operation* output_op = consumer_it->owner(); - if (op_to_node_map.find(output_op) != op_to_node_map.end()) { - FusionNode* downstream_node = op_to_node_map[output_op]; - cur_node->downstream[downstream_node] = related_value; - downstream_node->upstream[cur_node] = related_value; - } - } - } - - if (cur_node->upstream.empty()) { - entrance_nodes_.emplace(cur_node); - } - - if (cur_node->downstream.empty()) { - exit_nodes_.emplace(cur_node); - } - } - - VLOG(4) << "FusionGraph Created, fusion node size: " - << all_fusion_nodes_.size(); -} - -FusionGraph::~FusionGraph() { - for (FusionNode* node : all_fusion_nodes_) { - delete node; - } -} - std::vector GetShapeFromVars(const std::vector& vars) { std::vector res; for (const auto& v : vars) { @@ -605,175 +496,90 @@ void DebugPrintReduceVar(const FusibleOp& op) { } } -void FusionGraph::SplitReduceTransform() { - VLOG(4) << "SplitReduceTransform Start."; - std::vector result; - for (const auto& fop : fusion_results_) { - if 
(std::holds_alternative(fop)) { - VLOG(4) << "DebugPrint Op Origin: "; - ReduceOp reduce_op = std::get(fop); - ir::Tensor reduce_out_tensor = GetOutputTensor(reduce_op); - // substitude compute_body with a new init value. - ir::Expr trivial_compute_body = - ExprTransformerUtils::ChangeTensorLoadTransformer( - GetOutputTensor(fop), - GetInitExpr(reduce_op))(GetComputeBody(reduce_op)); - - const std::vector& all_iters = ComposeUtils::ConcatVector( - GetOutputIters(reduce_op), GetReduceIters(reduce_op)); - VLOG(4) << "Trivial Compute Body is " << trivial_compute_body; - ir::Tensor new_trivial_tensor = - ir::Tensor(reduce_out_tensor->name + "_split_transform", - reduce_out_tensor->type(), +std::pair SplitReduceOp(const ReduceOp& reduce_op) { + VLOG(4) << "DebugPrint Op Origin: "; + ir::Tensor reduce_out_tensor = GetOutputTensor(reduce_op); + // substitude compute_body with a new init value. + ir::Expr trivial_compute_body = + ExprTransformerUtils::ChangeTensorLoadTransformer( + GetOutputTensor(reduce_op), + GetInitExpr(reduce_op))(GetComputeBody(reduce_op)); + + const std::vector& all_iters = ComposeUtils::ConcatVector( + GetOutputIters(reduce_op), GetReduceIters(reduce_op)); + VLOG(4) << "Trivial Compute Body is " << trivial_compute_body; + ir::Tensor new_trivial_tensor = + ir::Tensor(reduce_out_tensor->name + "_split_transform", + reduce_out_tensor->type(), + GetShapeFromVars(all_iters), + GetShapeFromVars(all_iters), + ir::ComputeOp::Make( + reduce_out_tensor->name + "_split_transform", + [body = trivial_compute_body]( + const std::vector& indices) { return body; }, GetShapeFromVars(all_iters), GetShapeFromVars(all_iters), - ir::ComputeOp::Make( - reduce_out_tensor->name + "_split_transform", - [body = trivial_compute_body]( - const std::vector& indices) { return body; }, - GetShapeFromVars(all_iters), - GetShapeFromVars(all_iters), - {}), - {}); - new_trivial_tensor->WithBuffer(); - VLOG(4) << "Created Tensor is: " << new_trivial_tensor; - VLOG(4) << "Load Expr is: " - << new_trivial_tensor(ComposeUtils::VarVec2ExprVec(all_iters)); - - // push trivial op - VLOG(4) << "Splited TrivialOp is " - << CreateTrivialExpr( - all_iters, trivial_compute_body, new_trivial_tensor); - - result.emplace_back(TrivialOp(CreateTrivialExpr( - all_iters, trivial_compute_body, new_trivial_tensor))); - - // push reduce op, change compute_body to - VLOG(4) - << "WrapReduceOperation start: with reduce_type: " + {}), + {}); + new_trivial_tensor->WithBuffer(); + VLOG(4) << "Created Tensor is: " << new_trivial_tensor; + VLOG(4) << "Load Expr is: " + << new_trivial_tensor(ComposeUtils::VarVec2ExprVec(all_iters)); + + // push trivial op + VLOG(4) << "Splited TrivialOp is " + << CreateTrivialExpr( + all_iters, trivial_compute_body, new_trivial_tensor); + + const auto& result_trivial = TrivialOp( + CreateTrivialExpr(all_iters, trivial_compute_body, new_trivial_tensor)); + + // push reduce op, change compute_body to + VLOG(4) << "WrapReduceOperation start: with reduce_type: " << GetOutputTensor(reduce_op)->body().As()->reduce_type; - VLOG(4) << "WrapReduceOperation new_trivial_tensor: " - << new_trivial_tensor(ComposeUtils::VarVec2ExprVec(all_iters)); - const ir::Expr& new_reduce_body = - ExprTransformerUtils::WrapReduceOperation( - GetOutputTensor(reduce_op)->body().As()->reduce_type, - GetOutputTensor(reduce_op), - ComposeUtils::VarVec2ExprVec(GetOutputIters(reduce_op)))( - new_trivial_tensor(ComposeUtils::VarVec2ExprVec(all_iters))); - VLOG(4) << "Splited ReduceOp body is " << new_reduce_body; - VLOG(4) << "Splited 
ReduceOp is " - << CreateExprWithNewComputeBody( - fop, - ExprSetFinderUtils::Store2Value.GetSingle( - new_reduce_body)); - result.emplace_back(ReduceOp(CreateExprWithNewComputeBody( - fop, ExprSetFinderUtils::Store2Value.GetSingle(new_reduce_body)))); - } else { - result.emplace_back(fop); - } - } - fusion_results_ = result; + VLOG(4) << "WrapReduceOperation new_trivial_tensor: " + << new_trivial_tensor(ComposeUtils::VarVec2ExprVec(all_iters)); + const ir::Expr& new_reduce_body = ExprTransformerUtils::WrapReduceOperation( + GetOutputTensor(reduce_op)->body().As()->reduce_type, + GetOutputTensor(reduce_op), + ComposeUtils::VarVec2ExprVec(GetOutputIters(reduce_op)))( + new_trivial_tensor(ComposeUtils::VarVec2ExprVec(all_iters))); + VLOG(4) << "Splited ReduceOp body is " << new_reduce_body; + VLOG(4) << "Splited ReduceOp is " + << CreateExprWithNewComputeBody( + reduce_op, + ExprSetFinderUtils::Store2Value.GetSingle(new_reduce_body)); + const auto& result_reduce = ReduceOp(CreateExprWithNewComputeBody( + reduce_op, ExprSetFinderUtils::Store2Value.GetSingle(new_reduce_body))); VLOG(4) << "SplitReduceTransform End~"; + return std::make_pair(result_trivial, result_reduce); } -std::vector FusionGraph::DoFusion() { - VLOG(4) << "Start Trivial Fusion"; - DoTrivialFusion(); - VLOG(4) << "Start R + T and R + R Fusion"; - ReduceLoopTranform(); - // TODO(@xubin): remove this when backend support arbitrary reduce. - VLOG(4) << "Split Reduce Transform into a tmp tensor to keep reduce clean."; - SplitReduceTransform(); - return GetExprResults(); -} - -FusionNode* FusionGraph::FindTrivialFusibleNode() { - for (FusionNode* node : all_fusion_nodes_) { - if (node->IsTrivial() && !node->downstream.empty()) { - return node; - } - } - return nullptr; -} - -void FusionGraph::DoTrivialFusion() { - FusionNode* upstream = nullptr; - // use funcion to get upstream and downstream is save here - // cause we might delete Nodes in this process - while ((upstream = FindTrivialFusibleNode()) != nullptr) { - std::unordered_map fusion_candidate = - upstream->downstream; - upstream->downstream.clear(); - for (const auto& pair_data : fusion_candidate) { - FusionNode* downstream = pair_data.first; - FusionNode* new_node = - new FusionNode(TrivialFusion(upstream, downstream)); - new_node->replace_topo_structure_of_fused_nodes(upstream, downstream); - AppendNode(new_node); - RemoveNode(downstream); - } - RemoveNode(upstream); - } -} - -void FusionGraph::ReduceLoopTranform() { - for (FusionNode* node : exit_nodes_) { - auto fusion_nodes = ReduceTransform(node); - fusion_results_.insert( - fusion_results_.end(), fusion_nodes.begin(), fusion_nodes.end()); - } -} - -std::vector FusionGraph::GetExprResults() { - std::vector output_exprs; - for (const auto& node : fusion_results_) { - output_exprs.emplace_back(_GetRootExpr(node)); - } - return output_exprs; -} - -void FusionGraph::RemoveNode(FusionNode* node) { - if (all_fusion_nodes_.find(node) != all_fusion_nodes_.end()) { - all_fusion_nodes_.erase(node); - } - if (entrance_nodes_.find(node) != entrance_nodes_.end()) { - entrance_nodes_.erase(node); - } - if (exit_nodes_.find(node) != exit_nodes_.end()) { - exit_nodes_.erase(node); - } - delete node; -} - -void FusionGraph::AppendNode(FusionNode* node) { - all_fusion_nodes_.emplace(node); - if (node->upstream.empty()) { - entrance_nodes_.emplace(node); - } +} // namespace trivial_fusion_detail - if (node->downstream.empty()) { - exit_nodes_.emplace(node); - } -} +std::vector OperationFusion( + const std::vector<::pir::Operation*>& 
original_ops, + const std::vector& op_compute_bodies) { + const auto& ops = trivial_fusion_detail::FilterVector( + original_ops, [](const ::pir::Operation* op) { + if (op->name() == "cinn_op.generate_shape") { + return false; + } + return true; + }); -FusionNode* FusionGraph::FindReduceUpstream(FusionNode* node) { - for (const auto& pair_data : node->upstream) { - FusionNode* upstream = pair_data.first; - if (!upstream->IsTrivial()) { - return upstream; - } + std::vector contents; + for (int i = 0; i < ops.size(); i++) { + contents.emplace_back(ops[i], op_compute_bodies[i]); + // contents.emplace_back(ops[i]); } - return nullptr; -} + const auto& fusion_nodes = + cinn::fusion::ClusterOps(contents); -} // namespace trivial_fusion_detail + CHECK(fusion_nodes.size() == 1) + << "Only support one fusion node in backend now."; -std::vector OperationFusion( - const std::vector<::pir::Operation*>& ops, - const std::vector& op_compute_bodies) { - trivial_fusion_detail::FusionGraph graph = - trivial_fusion_detail::FusionGraph(ops, op_compute_bodies); - auto output = graph.DoFusion(); + const auto& output = GetExprFromPattern(fusion_nodes[0]->stmt_pattern_); VLOG(4) << "Fusion Result: output size is " << output.size(); for (const auto& expr : output) { VLOG(4) << expr; diff --git a/paddle/cinn/hlir/framework/pir/trivial_op_impl.h b/paddle/cinn/hlir/framework/pir/trivial_op_impl.h index f5964ad854848..48eddf4852870 100644 --- a/paddle/cinn/hlir/framework/pir/trivial_op_impl.h +++ b/paddle/cinn/hlir/framework/pir/trivial_op_impl.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once +#include #include #include "paddle/cinn/hlir/dialect/operator/ir/manual_op.h" @@ -28,6 +29,7 @@ #include "paddle/cinn/ir/schedule/ir_schedule.h" #include "paddle/cinn/ir/schedule/ir_schedule_util.h" #include "paddle/cinn/lang/placeholder.h" +#include "paddle/cinn/operator_fusion/pattern_graph.h" #include "paddle/cinn/optim/schedule_block_dce.h" #include "paddle/cinn/optim/transform_gpu_forloop.h" #include "paddle/common/ddim.h" @@ -92,6 +94,8 @@ ir::Expr* _GetFuncBodyPointer(FusibleOp op); ir::Expr CopyReduceBody(const FusibleOp& downstream, const ReduceOp& upstream); +int GetTensorCounter(); + ir::Expr CreateReduceExpr( const std::vector& output_iters, const std::vector& reduce_iters, @@ -103,23 +107,13 @@ ir::Expr CreateReduceExpr( ir::Expr CreateTrivialExpr(const std::vector& output_iters, const ir::Expr& function_body, const ir::Tensor& new_write_tensor); + ir::Expr CreateExprWithNewComputeBody(const FusibleOp& fusible_op, const ir::Expr& new_compute_body); -struct FusionNode { - FusibleOp fusible_op; - ::pir::Operation* expr_related_op; - - std::unordered_map upstream; - std::unordered_map downstream; - - explicit FusionNode(FusibleOp fusible_op); - static std::string GetTensorCounter(); - void replace_topo_structure_of_fused_nodes(FusionNode* fused_up_node, - FusionNode* fused_down_node); +bool CheckAllLoopRangeEq(ReduceOp reduce_upper, TrivialOp trivial_down); - bool IsTrivial() const; -}; +FusibleOp CreateFusibleOp(ir::Expr compute_body, OpPatternKind op_pattern); template DownStreamOp TrivalxOther_Fusion(TrivialOp upstream, DownStreamOp downstream) { @@ -142,54 +136,30 @@ DownStreamOp TrivalxOther_Fusion(TrivialOp upstream, DownStreamOp downstream) { return DownStreamOp(modified_body); } -bool CheckAllLoopRangeEq(ReduceOp reduce_upper, TrivialOp trivial_down); - -std::vector TransformReduceLoopRange(const ReduceOp& upstream, - FusibleOp* downstream); - -FusibleOp TrivialFusion(FusionNode* upstream, 
FusionNode* downstream); - -FusibleOp SinkTrivialLoopAlign(TrivialOp trivial_op, ReduceOp reduce_op); - -std::vector ReduceTransformRecursive(FusibleOp root_op, - FusionNode* fusion_tree); -std::vector ReduceTransform(FusionNode* downstream); - -FusibleOp CreateFusibleOp(ir::Expr compute_body, OpPatternKind op_pattern); - -struct FusionGraph { - explicit FusionGraph(const std::vector<::pir::Operation*>& ops, - const std::vector& op_compute_bodies); - - ~FusionGraph(); - - std::vector DoFusion(); - - private: - FusionNode* FindTrivialFusibleNode(); - - void DoTrivialFusion(); - - void ReduceLoopTranform(); - - void SplitReduceTransform(); - - std::vector GetExprResults(); - - void RemoveNode(FusionNode* node); - - void AppendNode(FusionNode* node); - - FusionNode* FindReduceUpstream(FusionNode* node); - - private: - std::unordered_set all_fusion_nodes_; - std::vector fusion_results_; - std::unordered_set entrance_nodes_; - std::unordered_set exit_nodes_; +std::pair SplitReduceOp(const ReduceOp& reduce_op); + +std::vector TransformReduceLoopRange( + const ReduceOp& upstream, + FusibleOp* downstream, + std::vector fake_reduce_iter_idx); + +template +std::vector FilterWithFakeReduceIter( + const std::vector& input, std::vector fake_reduce_iter_idx) { + std::vector result; + for (size_t i = 0; i < input.size(); i++) { + if (std::find(fake_reduce_iter_idx.begin(), + fake_reduce_iter_idx.end(), + i) == fake_reduce_iter_idx.end()) { + result.emplace_back(input.at(i)); + } + } + return result; +} - // std::unordered_map<::pir::Value, ShardableAxes> shardable_axes_; -}; +FusibleOp SinkTrivialLoopAlign(TrivialOp trivial_op, + ReduceOp reduce_op, + std::vector fake_reduce_iter_idx); } // namespace trivial_fusion_detail diff --git a/paddle/cinn/hlir/framework/pir/trivial_op_util.cc b/paddle/cinn/hlir/framework/pir/trivial_op_util.cc index 9b776aae4e454..c930aa8a8fd95 100644 --- a/paddle/cinn/hlir/framework/pir/trivial_op_util.cc +++ b/paddle/cinn/hlir/framework/pir/trivial_op_util.cc @@ -502,7 +502,7 @@ void CheckFusionInputValid(const std::vector& op_compute_bodies, const std::vector& op_patterns) { if (VLOG_IS_ON(4)) { for (const auto& func : op_compute_bodies) { - VLOG(4) << "TrivialOpFusion: {FuncBody is} :" << func; + VLOG(4) << "FuncBody is :" << func; } for (const auto& op_ptn : op_patterns) { VLOG(4) << "OpPattern is :" << op_ptn; diff --git a/paddle/cinn/hlir/framework/pir/trivial_op_util.h b/paddle/cinn/hlir/framework/pir/trivial_op_util.h index e28cad31310f7..9dbddc6ada18c 100644 --- a/paddle/cinn/hlir/framework/pir/trivial_op_util.h +++ b/paddle/cinn/hlir/framework/pir/trivial_op_util.h @@ -46,6 +46,18 @@ std::vector ConcatVector(const std::vector& first, return result; } +template +std::unordered_map MakeMap(const std::vector& keys, + const std::vector& values) { + std::unordered_map result = std::unordered_map(); + + CHECK(keys.size() == values.size()); + for (int i = 0; i < keys.size(); i++) { + result[keys[i]] = values[i]; + } + return result; +} + std::vector ExprVec2VarVec(const std::vector& in); std::vector VarVec2ExprVec(const std::vector& in); diff --git a/paddle/cinn/hlir/framework/pir/utils.cc b/paddle/cinn/hlir/framework/pir/utils.cc index 4d20fbf382fe6..1bc39aee5370f 100644 --- a/paddle/cinn/hlir/framework/pir/utils.cc +++ b/paddle/cinn/hlir/framework/pir/utils.cc @@ -37,6 +37,7 @@ PD_DECLARE_string(allow_cinn_ops); PD_DECLARE_string(deny_cinn_ops); +COMMON_DECLARE_bool(disable_dyshape_in_train); namespace cinn { namespace hlir { @@ -125,23 +126,23 @@ class OpTransInfo { 
DeParamCondT deny_param_cond_{{"batch_norm", {"ReserveSpace"}}, {"batch_norm_grad", {"ReserveSpace"}}}; - std::unordered_set default_deny_ops_{ - "feed", - "fetch", - "conv2d", - "conv2d_grad", - "depthwise_conv2d", - "depthwise_conv2d_grad", - "dropout", - "pool2d", - "pool2d_grad", - "split", - "matmul", - "matmul_grad", - "embedding_grad", - "embedding", - "arange", - }; + std::unordered_set default_deny_ops_{"feed", + "fetch", + "conv2d", + "conv2d_grad", + "depthwise_conv2d", + "depthwise_conv2d_grad", + "dropout", + "pool2d", + "pool2d_grad", + "split", + "matmul", + "matmul_grad", + "embedding_grad", + "embedding", + "arange", + "softmax", + "randint"}; }; std::string OpNameAfterStripDialect(const ::pir::Operation& op) { @@ -173,12 +174,7 @@ bool UnimplementOps(const ::pir::Operation& op) { return false; } -bool HaveZeroDimInput(const ::pir::Operation& op) { - auto HasZeroDim = [](const ::pir::Type& type) { - auto tensor_type = type.dyn_cast<::pir::DenseTensorType>(); - return tensor_type && tensor_type.dims().size() == 0U; - }; - +bool HaveUnkDim(const ::pir::Operation& op) { auto HasNegDim = [](const ::pir::Type& type) { auto tensor_type = type.dyn_cast<::pir::DenseTensorType>(); @@ -194,9 +190,9 @@ bool HaveZeroDimInput(const ::pir::Operation& op) { }; // Judge for vector - auto HasZeroDimInVT = [&](const std::vector<::pir::Type>& types) { + auto HasUnkDimInVT = [&](const std::vector<::pir::Type>& types) { for (auto& type : types) { - if (HasZeroDim(type)) return true; + if (HasNegDim(type)) return true; } return false; }; @@ -205,8 +201,18 @@ bool HaveZeroDimInput(const ::pir::Operation& op) { auto value = op.operand_source(i); if (!value || !value.type()) continue; if (auto vector_type = value.type().dyn_cast<::pir::VectorType>()) { - if (HasZeroDimInVT(vector_type.data())) return true; - } else if (HasZeroDim(value.type()) || HasNegDim(value.type())) { + if (HasUnkDimInVT(vector_type.data())) return true; + } else if (HasNegDim(value.type())) { + return true; + } + } + + for (size_t i = 0; i < op.num_results(); ++i) { + auto value = op.result(i); + if (!value || !value.type()) continue; + if (auto vector_type = value.type().dyn_cast<::pir::VectorType>()) { + if (HasUnkDimInVT(vector_type.data())) return true; + } else if (HasNegDim(value.type())) { return true; } } @@ -323,22 +329,21 @@ bool IsTempDenySpecialOp(const ::pir::Operation& op) { if (op.name() == "cinn_op.generate_shape") { return false; } - return IsShapeComputeOp(op) || IsSmallNumelOp(op); + return IsShapeComputeOp(op); } // Mainly used for pd_to_cinn_pass and reused in IsSupportInCinn function. bool IsDeniedInCinn(const ::pir::Operation& op) { + if (FLAGS_disable_dyshape_in_train && HaveUnkDim(op)) { + return true; + } if (!AllInputDenseTensor(op) || UnimplementOps(op)) { VLOG(5) << "Found " << op.name() << " UnimplementOps or NotAllInputDenseTensor. " << "So mark IsDeniedForCinn: " << true; return true; } - if (IsTempDenySpecialOp(op)) { - VLOG(5) << "Found " << op.name() << " is in TempDenySpecialOp." 
- << "So mark IsDeniedForCinn: " << true; - return true; - } + // Strip the dialect, like pd_op.abs -> abs const auto op_name = OpNameAfterStripDialect(op); const bool is_denied = OpTransInfo().IsDeniedByDefault(op_name); @@ -423,12 +428,12 @@ std::string CompatibleInfo::OpFuncName(const ::pir::Operation& op) { std::string CompatibleInfo::GroupOpsName( const std::vector<::pir::Operation*>& ops) { - std::string name = "fn"; + std::string name = "fn_"; for (auto* op : ops) { - std::string op_name = OpName(*op); - name += "_" + cinn::common::Context::Global().NewName(op_name); + name += OpName(*op); + name += "_"; } - return name; + return cinn::common::Context::Global().NewName(name); } std::string CompatibleInfo::ValueName(const ::pir::Value& value) { diff --git a/paddle/cinn/hlir/framework/pir_compiler.cc b/paddle/cinn/hlir/framework/pir_compiler.cc index 2db39508ce1e1..73f2d11f3e1b4 100644 --- a/paddle/cinn/hlir/framework/pir_compiler.cc +++ b/paddle/cinn/hlir/framework/pir_compiler.cc @@ -16,23 +16,128 @@ #include "paddle/cinn/hlir/framework/pir/utils.h" #include "paddle/cinn/utils/multi_threading.h" +#include "paddle/common/enforce.h" +#include "paddle/common/flags.h" + +PD_DECLARE_bool(enable_cinn_compile_cache); namespace cinn::hlir::framework { +class CompilationContextMapper { + public: + CompilationContextMapper(const Target& target, + const std::vector& groups) { + Construct(target, groups); + } + std::vector& UniqueCompilationContexts() { + return group_compilation_contexts_; + } + std::vector>& + MutableCompilationResult() { + return compilation_results_; + } + + std::vector RecoverKernelInfos(); + void UpdateGlobalCache(); + void SetFinalize(bool val) { is_finalized_ = val; } + + private: + void Construct(const Target& target, + const std::vector& groups); + std::vector mapper_index_; + std::vector fusion_infos_; + std::vector group_compilation_contexts_; + std::vector> compilation_results_; + + bool is_finalized_{false}; +}; + std::vector PirCompiler::Build( const std::vector& groups) { - std::vector kernel_infos(groups.size()); - for (int i = 0; i < groups.size(); ++i) { - group_compilation_contexts_.emplace_back(target_, groups[i]); + CompilationContextMapper ctx_mapper(target_, groups); + auto& group_compilation_contexts = ctx_mapper.UniqueCompilationContexts(); + auto& compilation_results = ctx_mapper.MutableCompilationResult(); + + const size_t task_size = group_compilation_contexts.size(); + const size_t thread_size = FLAGS_enable_cinn_compile_cache ? 
task_size : 1; + VLOG(5) << "Found " << task_size << " new groups parsed from " + << groups.size(); + if (task_size > 0) { + auto worker_fn = [&](int index) { + CompilationTask task(&group_compilation_contexts[index]); + compilation_results[index] = task(); + }; + utils::parallel_run(worker_fn, + utils::SequenceDispatcher(0, task_size), + /*thread_num=*/thread_size); } - auto worker_fn = [&](int index) { - CompilationTask task(&group_compilation_contexts_[index]); - task(); - kernel_infos[index] = task.GetCINNKernelInfo(); + ctx_mapper.SetFinalize(true); + ctx_mapper.UpdateGlobalCache(); + return ctx_mapper.RecoverKernelInfos(); +} + +void CompilationContextMapper::Construct( + const Target& target, const std::vector& groups) { + std::unordered_set unique_infos; + const auto IsNewAndUnique = + [&unique_infos](const pir::FusionInfo& info) -> bool { + const bool is_unique = unique_infos.find(info.hash()) == unique_infos.end(); + const bool is_new = !CompilationCache::Instance().Has(info); + return is_new && is_unique; }; - utils::parallel_run( - worker_fn, utils::SequenceDispatcher(0, groups.size()), -1); + + for (size_t i = 0; i < groups.size(); ++i) { + fusion_infos_.emplace_back(*groups[i]); + // If FLAGS_enable_cinn_compile_cache=False, Cache strategy will not take + // effects. + if (IsNewAndUnique(fusion_infos_[i]) || !FLAGS_enable_cinn_compile_cache) { + mapper_index_.push_back(i); + group_compilation_contexts_.emplace_back(target, groups[i]); + compilation_results_.push_back( + std::make_shared(target)); + } + unique_infos.insert(fusion_infos_[i].hash()); + } +} + +std::vector +CompilationContextMapper::RecoverKernelInfos() { + PADDLE_ENFORCE_EQ( + is_finalized_, + true, + ::common::errors::PreconditionNotMet( + "Required is_finalized_ = true, please call SetFinalize() firstly.")); + PADDLE_ENFORCE_EQ(group_compilation_contexts_.size(), + compilation_results_.size(), + ::common::errors::PreconditionNotMet( + "Required group_compilation_contexts_.size() = " + "compilation_results_.size().")); + + std::vector kernel_infos(fusion_infos_.size()); + for (size_t i = 0; i < fusion_infos_.size(); ++i) { + kernel_infos[i] = + CompilationCache::Instance().GetKernelInfo(fusion_infos_[i]); + } return kernel_infos; } +void CompilationContextMapper::UpdateGlobalCache() { + PADDLE_ENFORCE_EQ( + is_finalized_, + true, + ::common::errors::PreconditionNotMet( + "Required is_finalized_ = true, please call SetFinalize() firstly.")); + for (size_t i = 0; i < compilation_results_.size(); ++i) { + PADDLE_ENFORCE_LT(mapper_index_[i], + fusion_infos_.size(), + ::common::errors::PreconditionNotMet( + "Required mapper_index < fusion_infos_.size().")); + const auto& fusion_info = fusion_infos_[mapper_index_[i]]; + const auto& int_args_map = + compilation_results_[i]->GetBackendResource()->GetIntArgsMap(); + VLOG(5) << "Insert new compiled result into cache, fusion_info: " + << fusion_info << ", int_args_map: " << int_args_map; + CompilationCache::Instance().Insert(fusion_info, compilation_results_[i]); + } +} } // namespace cinn::hlir::framework diff --git a/paddle/cinn/hlir/framework/pir_compiler.h b/paddle/cinn/hlir/framework/pir_compiler.h index d9429b76a6fa8..9ea83defa0cb9 100644 --- a/paddle/cinn/hlir/framework/pir_compiler.h +++ b/paddle/cinn/hlir/framework/pir_compiler.h @@ -31,7 +31,6 @@ class PirCompiler final { CINN_DISALLOW_COPY_AND_ASSIGN(PirCompiler); Target target_; - std::vector group_compilation_contexts_; }; } // namespace cinn::hlir::framework diff --git a/paddle/cinn/hlir/framework/program.cc 
b/paddle/cinn/hlir/framework/program.cc index 0e00795ae775d..dd8d8aba91da0 100644 --- a/paddle/cinn/hlir/framework/program.cc +++ b/paddle/cinn/hlir/framework/program.cc @@ -169,6 +169,33 @@ void Program::Export(const std::vector& persistent_vars, fclose(f); } +void DeviceSynchronizeImpl(common::UnknownArch, void* stream) { + LOG(FATAL) << "NotImplemented."; +} + +void DeviceSynchronizeImpl(common::X86Arch, void* stream) { + // Do nothing. +} + +void DeviceSynchronizeImpl(common::ARMArch, void* stream) { + // Do nothing. +} + +void DeviceSynchronizeImpl(common::NVGPUArch, void* stream) { +#ifdef CINN_WITH_CUDA + VLOG(4) << "-- The value of the used stream: " << stream; + if (stream == nullptr) { + CUDA_CALL(cudaDeviceSynchronize()); + } +#endif +} + +void DeviceSynchronize(common::Arch arch, void* stream) { + return std::visit( + [&](const auto& impl) { return DeviceSynchronizeImpl(impl, stream); }, + arch.variant()); +} + void Program::Execute( const std::map* name2podargs, void* stream, @@ -176,12 +203,7 @@ void Program::Execute( for (auto& ins : instrs_) { ins->Run(name2podargs, false, stream, use_cache); } -#ifdef CINN_WITH_CUDA - VLOG(4) << "-- The value of the used stream: " << stream; - if (instrs_[0]->target_.arch == Target::Arch::NVGPU && stream == nullptr) { - CUDA_CALL(cudaDeviceSynchronize()); - } -#endif + DeviceSynchronize(instrs_[0]->target_.arch, stream); } void Program::ExecuteTest(int repeat_) { @@ -197,11 +219,7 @@ void Program::ExecuteTest(int repeat_) { ins->Run(); } } -#ifdef CINN_WITH_CUDA - if (instrs_[0]->target_.arch == Target::Arch::NVGPU) { - CUDA_CALL(cudaDeviceSynchronize()); - } -#endif + DeviceSynchronize(instrs_[0]->target_.arch, nullptr); double test_op_time = timer1.Stop() / repeat_; VLOG(3) << "Repeat times: [" << repeat_ << "], average op time: [" << test_op_time << "] ms"; diff --git a/paddle/cinn/hlir/op/contrib/argmax.cc b/paddle/cinn/hlir/op/contrib/argmax.cc index b3c6a647c4bc3..f1ccccd61d7c4 100644 --- a/paddle/cinn/hlir/op/contrib/argmax.cc +++ b/paddle/cinn/hlir/op/contrib/argmax.cc @@ -184,7 +184,7 @@ std::shared_ptr StrategyForArgmax( output_shapes[0].end(), 1, std::multiplies()); - if (prod_size > 1 && target.arch == Target::Arch::X86) { + if (prod_size > 1 && std::holds_alternative(target.arch)) { pe::IRScheduleInjectiveCPU(ir_sch, output_shapes.front(), target, true); } std::vector res{ diff --git a/paddle/cinn/hlir/op/contrib/argmin.cc b/paddle/cinn/hlir/op/contrib/argmin.cc index dff137f0d9952..798f420cc76fc 100644 --- a/paddle/cinn/hlir/op/contrib/argmin.cc +++ b/paddle/cinn/hlir/op/contrib/argmin.cc @@ -182,7 +182,7 @@ std::shared_ptr StrategyForArgmin( output_shapes[0].end(), 1, std::multiplies()); - if (prod_size > 1 && target.arch == Target::Arch::X86) { + if (prod_size > 1 && std::holds_alternative(target.arch)) { pe::IRScheduleInjectiveCPU(ir_sch, output_shapes.front(), target, true); } std::vector res{ diff --git a/paddle/cinn/hlir/op/contrib/gather_nd.cc b/paddle/cinn/hlir/op/contrib/gather_nd.cc index 8080cabb83609..92ba839f17211 100644 --- a/paddle/cinn/hlir/op/contrib/gather_nd.cc +++ b/paddle/cinn/hlir/op/contrib/gather_nd.cc @@ -187,11 +187,17 @@ std::shared_ptr StrategyForGatherNd( 1, std::multiplies()); if (prod_size > 1) { - if (target.arch == Target::Arch::NVGPU) { - pe::IRCudaScheduleInjective(ir_sch, output_shapes.front(), target); - } else if (target.arch == Target::Arch::X86) { - pe::IRScheduleInjectiveCPU(ir_sch, output_shapes.front(), target, true); - } + target.arch.Visit(adt::match{ + [&](common::UnknownArch) { 
CINN_NOT_IMPLEMENTED; }, + [&](common::X86Arch) { + pe::IRScheduleInjectiveCPU( + ir_sch, output_shapes.front(), target, true); + }, + [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::NVGPUArch) { + pe::IRCudaScheduleInjective(ir_sch, output_shapes.front(), target); + }, + }); } std::vector res{ cinn::common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; diff --git a/paddle/cinn/hlir/op/contrib/repeat.cc b/paddle/cinn/hlir/op/contrib/repeat.cc index f77e5939099b5..5347d454c39aa 100644 --- a/paddle/cinn/hlir/op/contrib/repeat.cc +++ b/paddle/cinn/hlir/op/contrib/repeat.cc @@ -198,11 +198,17 @@ std::shared_ptr StrategyForRepeat( 1, std::multiplies()); if (prod_size > 1) { - if (target.arch == Target::Arch::NVGPU) { - pe::IRCudaScheduleInjective(ir_sch, output_shapes.front(), target); - } else if (target.arch == Target::Arch::X86) { - pe::IRScheduleInjectiveCPU(ir_sch, output_shapes.front(), target, true); - } + target.arch.Visit(adt::match{ + [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::X86Arch) { + pe::IRScheduleInjectiveCPU( + ir_sch, output_shapes.front(), target, true); + }, + [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::NVGPUArch) { + pe::IRCudaScheduleInjective(ir_sch, output_shapes.front(), target); + }, + }); } std::vector res{ cinn::common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; diff --git a/paddle/cinn/hlir/op/contrib/resize.cc b/paddle/cinn/hlir/op/contrib/resize.cc index 91319ef7e5ac1..63329c5602013 100644 --- a/paddle/cinn/hlir/op/contrib/resize.cc +++ b/paddle/cinn/hlir/op/contrib/resize.cc @@ -55,15 +55,18 @@ ir::Tensor Resize(const ir::Tensor &input, const std::string &mode, const std::string &output_name) { std::string func_name; - - if (target.arch == cinn::common::Target::Arch::NVGPU) { - func_name.assign("cinn_cuda_resize_"); - } else if (target.arch == cinn::common::Target::Arch::X86) { - func_name.assign("cinn_host_resize_"); - } else { - PADDLE_THROW(phi::errors::Fatal( - "Resize only supports X86 and NVGPU ! Please Check.\n")); - } + target.arch.Visit(adt::match{ + [&](common::UnknownArch) { + PADDLE_THROW(phi::errors::Fatal( + "Resize only supports X86 and NVGPU ! Please Check.\n")); + }, + [&](common::X86Arch) { func_name.assign("cinn_host_resize_"); }, + [&](common::ARMArch) { + PADDLE_THROW(phi::errors::Fatal( + "Resize only supports X86 and NVGPU ! 
Please Check.\n")); + }, + [&](common::NVGPUArch) { func_name.assign("cinn_cuda_resize_"); }, + }); if (mode == "bilinear") { func_name.append("bilinear"); @@ -241,11 +244,17 @@ std::shared_ptr StrategyForResize( 1, std::multiplies()); if (prod_size > 1) { - if (target.arch == Target::Arch::NVGPU) { - pe::IRCudaScheduleInjective(ir_sch, output_shapes.front(), target); - } else if (target.arch == Target::Arch::X86) { - pe::IRScheduleInjectiveCPU(ir_sch, output_shapes.front(), target, true); - } + target.arch.Visit(adt::match{ + [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::X86Arch) { + pe::IRScheduleInjectiveCPU( + ir_sch, output_shapes.front(), target, true); + }, + [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::NVGPUArch) { + pe::IRCudaScheduleInjective(ir_sch, output_shapes.front(), target); + }, + }); } std::vector res{ cinn::common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; diff --git a/paddle/cinn/hlir/op/contrib/sort.cc b/paddle/cinn/hlir/op/contrib/sort.cc index 49f50a13ab6c9..2ecb08b41749c 100644 --- a/paddle/cinn/hlir/op/contrib/sort.cc +++ b/paddle/cinn/hlir/op/contrib/sort.cc @@ -51,14 +51,22 @@ std::vector ArgSort(const ir::Tensor &A, const std::string &name) { std::string find_func_name; std::string index_func_name; - if (target.arch == cinn::common::Target::Arch::NVGPU) { - find_func_name.assign("cinn_nvgpu_next_smallest_int32"); - } else if (target.arch == cinn::common::Target::Arch::X86) { - find_func_name.assign("cinn_host_next_smallest_int32"); - } else { - PADDLE_THROW(phi::errors::Fatal( - "ArgSort only supports X86 and NVGPU ! Please Check.\n")); - } + target.arch.Visit(adt::match{ + [&](common::UnknownArch) { + PADDLE_THROW(phi::errors::Fatal( + "ArgSort only supports X86 and NVGPU ! Please Check.\n")); + }, + [&](common::X86Arch) { + find_func_name.assign("cinn_host_next_smallest_int32"); + }, + [&](common::ARMArch) { + PADDLE_THROW(phi::errors::Fatal( + "ArgSort only supports X86 and NVGPU ! 
Please Check.\n")); + }, + [&](common::NVGPUArch) { + find_func_name.assign("cinn_nvgpu_next_smallest_int32"); + }, + }); if (is_ascend) { index_func_name = cinn::hlir::GetExternFuncName(target, A->type(), "lt_num"); @@ -215,7 +223,8 @@ std::shared_ptr StrategyForSort( output_shapes[0].end(), 1, std::multiplies()); - if (prod_size > 1 && target.arch == Target::Arch::X86) { + if (prod_size > 1 && + std::holds_alternative(target.arch)) { pe::IRScheduleInjectiveCPU( ir_sch, output_shapes.front(), target, true); } @@ -298,7 +307,7 @@ std::shared_ptr StrategyForArgSort( output_shapes[0].end(), 1, std::multiplies()); - if (prod_size > 1 && target.arch == Target::Arch::X86) { + if (prod_size > 1 && std::holds_alternative(target.arch)) { pe::IRScheduleInjectiveCPU(ir_sch, output_shapes.front(), target, true); } std::vector res{ diff --git a/paddle/cinn/hlir/op/elementwise.cc b/paddle/cinn/hlir/op/elementwise.cc index 243ea5f0eb8a2..d32c2c0af8b2f 100644 --- a/paddle/cinn/hlir/op/elementwise.cc +++ b/paddle/cinn/hlir/op/elementwise.cc @@ -1763,6 +1763,8 @@ CINN_REGISTER_HELPER(elementwise_ops) { .set_num_outputs(1) .set_attr( "CINNStrategy", cinn::hlir::op::StrategyForLogicalNot) + .set_attr( + "CINNStrategySymbolic", cinn::hlir::op::StrategyForLogicalNotSymbolic) .set_attr("infershape", MakeOpFunction(cinn::hlir::op::InferShapeForElementwise)) .set_attr("inferdtype", diff --git a/paddle/cinn/hlir/op/nn.cc b/paddle/cinn/hlir/op/nn.cc index 2b1ce342e0810..995a5a6bc4787 100644 --- a/paddle/cinn/hlir/op/nn.cc +++ b/paddle/cinn/hlir/op/nn.cc @@ -71,6 +71,35 @@ std::shared_ptr StrategyForRelu( return strategy; } +std::shared_ptr StrategyForRelu6Symbolic( + const framework::NodeAttr &attrs, + const std::vector &inputs, + const std::vector &out_type, + const std::vector> &output_shapes, + const Target &target) { + framework::CINNCompute relu6_compute( + [](lang::Args args, lang::RetValue *ret) { + CHECK(!args.empty()) + << "The input argument of relu6 compute is empty! Please check.\n"; + CINNValuePack pack_args = args[0]; + CHECK(!pack_args.empty()) + << "at least one input tensor for relu6 compute\n"; + Expr A = pack_args[0]; + CHECK(A.as_tensor()); + CHECK_EQ(pack_args.size(), 2); + CHECK(pack_args[1].is_string()); + std::string tensor_name = pack_args[1].operator std::string(); + auto out = pe::Relu6(A.as_tensor_ref(), 0.0, tensor_name); + auto stages = CreateStages({out}); + *ret = CINNValuePack{{CINNValue(Expr(out.get())), CINNValue(stages)}}; + }); + + auto strategy = std::make_shared(); + CHECK(out_type.size()) << "Out_type of relu6 op is empty! 
Please check."; + strategy->AddImpl(relu6_compute, lang::PackedFunc(), "strategy.relu6.x86", 1); + return strategy; +} + std::shared_ptr StrategyForReluSymbolic( const framework::NodeAttr &attrs, const std::vector &inputs, @@ -167,7 +196,7 @@ std::shared_ptr StrategyForConv2d( int groups = 1; std::string key = ""; std::string conv_type = ""; - bool use_mkldnn = false; + bool use_onednn = false; if (attrs.attr_store.find("padding") != attrs.attr_store.end()) { padding = absl::get>(attrs.attr_store.at("padding")); } @@ -183,8 +212,8 @@ std::shared_ptr StrategyForConv2d( if (attrs.attr_store.find("groups") != attrs.attr_store.end()) { groups = absl::get(attrs.attr_store.at("groups")); } - if (attrs.attr_store.find("use_mkldnn") != attrs.attr_store.end()) { - use_mkldnn = absl::get(attrs.attr_store.at("use_mkldnn")); + if (attrs.attr_store.find("use_onednn") != attrs.attr_store.end()) { + use_onednn = absl::get(attrs.attr_store.at("use_onednn")); } if (attrs.attr_store.find("key") != attrs.attr_store.end()) { key = absl::get(attrs.attr_store.at("key")); @@ -230,22 +259,34 @@ std::shared_ptr StrategyForConv2d( if (data_format == "NCHW") { // A is input: [N, C, H, W], B is filter: [C_out, C_in/group, // filter_h, filter_w] - if (target.arch == Target::Arch::X86) { - if (groups == 1 && !use_mkldnn) { - out = pe::Conv2d_NCHW_5D(A.as_tensor_ref(), - B.as_tensor_ref(), - padding[0], - padding[1], - stride[0], - stride[1], - dilation[0], - dilation[1], - key, - tensor_name, - target); - } else { + target.arch.Visit(adt::match{ + [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::X86Arch) { + if (groups == 1 && !use_onednn) { + out = pe::Conv2d_NCHW_5D(A.as_tensor_ref(), + B.as_tensor_ref(), + padding[0], + padding[1], + stride[0], + stride[1], + dilation[0], + dilation[1], + key, + tensor_name, + target); + } else { #ifdef CINN_WITH_DNNL - out = pe::Conv2d_NCHW_MKLDNN(A.as_tensor_ref(), + out = pe::Conv2d_NCHW_ONEDNN(A.as_tensor_ref(), + B.as_tensor_ref(), + padding[0], + padding[1], + stride[0], + stride[1], + dilation[0], + dilation[1], + tensor_name); +#else + out = pe::Conv2d_NCHW_5D(A.as_tensor_ref(), B.as_tensor_ref(), padding[0], padding[1], @@ -253,45 +294,38 @@ std::shared_ptr StrategyForConv2d( stride[1], dilation[0], dilation[1], + key, tensor_name); -#else - out = pe::Conv2d_NCHW_5D(A.as_tensor_ref(), - B.as_tensor_ref(), - padding[0], - padding[1], - stride[0], - stride[1], - dilation[0], - dilation[1], - key, - tensor_name); #endif - } - } else { - if (conv_type == "forward") { - out = pe::Conv2d_NCHW(A.as_tensor_ref(), - B.as_tensor_ref(), - padding[0], - padding[1], - stride[0], - stride[1], - dilation[0], - dilation[1], - tensor_name); - out.push_back(B.as_tensor_ref()); - } else { + } + }, + [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::NVGPUArch) { + if (conv_type == "forward") { + out = pe::Conv2d_NCHW(A.as_tensor_ref(), + B.as_tensor_ref(), + padding[0], + padding[1], + stride[0], + stride[1], + dilation[0], + dilation[1], + tensor_name); + out.push_back(B.as_tensor_ref()); + } else { #ifdef CINN_WITH_CUDNN - // as backward_data and backward_filter is not support now, we - // built a fake op to instead. as the runtime use cudnn to compute - // the conv2d, so this fake op is not been called. When cinn - // support backward_filter/backward_data code gen, this code is to - // be removed. 
- out = pe::Identity(A.as_tensor_ref()); - out.push_back(A.as_tensor_ref()); - out.push_back(B.as_tensor_ref()); + // as backward_data and backward_filter is not support now, we + // built a fake op to instead. as the runtime use cudnn to + // compute the conv2d, so this fake op is not been called. + // When cinn support backward_filter/backward_data code gen, + // this code is to be removed. + out = pe::Identity(A.as_tensor_ref()); + out.push_back(A.as_tensor_ref()); + out.push_back(B.as_tensor_ref()); #endif - } - } + } + }, + }); } else if (data_format == "NHWC") { // A is input: [N, H, W, C], B is filter: [C_out, C_in/group, // filter_h, filter_w] @@ -339,39 +373,48 @@ std::shared_ptr StrategyForConv2d( ir::ModuleExpr mod_expr(vec_ast); ir::IRSchedule ir_sch(mod_expr); ir_sch.MergeExprs(); - if (target.arch == Target::Arch::NVGPU) { + target.arch.Visit(adt::match{ + [&](common::UnknownArch) { + PADDLE_THROW(phi::errors::InvalidArgument( + "This target [%s] is not supported yet.", target)); + }, + [&](common::X86Arch) { + PADDLE_THROW(phi::errors::InvalidArgument( + "This target [%s] is not supported yet.", target)); + }, + [&](common::ARMArch) { + PADDLE_THROW(phi::errors::InvalidArgument( + "This target [%s] is not supported yet.", target)); + }, + [&](common::NVGPUArch) { #ifdef CINN_WITH_CUDNN - // If conv_type is backward_filter or backward_data, we built a fake op. - // As runtime use cudnn to compute conv2d, this fake op is not to be - // called. When cinn support backward_filter/backward_data code gen, - // this code is to be removed. - if (conv_type != "forward") { - CHECK_EQ(vec_ast.size(), 1); - pe::IRCudaScheduleInjective(ir_sch, output_shapes.front(), target); - std::vector res{ - CINNValue(ir_sch.GetModule().GetExprs().at(0))}; - *ret = CINNValuePack{res}; - return; - } + // If conv_type is backward_filter or backward_data, we built a fake + // op. As runtime use cudnn to compute conv2d, this fake op is not to + // be called. When cinn support backward_filter/backward_data code + // gen, this code is to be removed. + if (conv_type != "forward") { + CHECK_EQ(vec_ast.size(), 1); + pe::IRCudaScheduleInjective(ir_sch, output_shapes.front(), target); + std::vector res{ + CINNValue(ir_sch.GetModule().GetExprs().at(0))}; + *ret = CINNValuePack{res}; + return; + } #endif - int expr_size = vec_ast.size(); - if (expr_size == 2) { - pe::IRCudaScheduleConv(ir_sch, target); - VLOG(3) << "After IRCudaScheduleConv, arg_pack[0] is : " - << ir_sch.GetModule().GetExprs().at(0); - std::vector res{ - CINNValue(ir_sch.GetModule().GetExprs().at(0))}; - *ret = CINNValuePack{res}; - return; - } else { - CINN_NOT_IMPLEMENTED - } - } else if (target.arch == Target::Arch::X86) { - CINN_NOT_IMPLEMENTED - } - std::stringstream ss; - ss << "This target [" << target << "] is not supported yet."; - PADDLE_THROW(phi::errors::InvalidArgument(ss.str())); + int expr_size = vec_ast.size(); + if (expr_size == 2) { + pe::IRCudaScheduleConv(ir_sch, target); + VLOG(3) << "After IRCudaScheduleConv, arg_pack[0] is : " + << ir_sch.GetModule().GetExprs().at(0); + std::vector res{ + CINNValue(ir_sch.GetModule().GetExprs().at(0))}; + *ret = CINNValuePack{res}; + return; + } else { + CINN_NOT_IMPLEMENTED + } + }, + }); }); auto strategy = std::make_shared(); @@ -619,7 +662,7 @@ std::shared_ptr StrategyForConv2dNCHWc( CHECK_EQ(dilation.size(), 2) << "The size of stride in conv2d_NCHWc op is not 2! 
Please check."; std::vector out; - CHECK(target.arch == Target::Arch::X86) + CHECK(std::holds_alternative(target.arch)) << "conv2d_NCHWc op is only used in x86"; // A is input: [N, C_in_outer, H, W, C_in_inner], B is filter: [C_out, // C_in_group_outer, filter_h, filter_w, C_in_group_inner] @@ -867,27 +910,32 @@ std::shared_ptr StrategyForDepthwiseConv2d( CHECK(pack_args[2].is_string()); std::string tensor_name = pack_args[2].operator std::string(); if (data_format == "NCHW") { - if (target.arch == Target::Arch::X86) { - out = pe::Conv2d_NCHW_5D(A.as_tensor_ref(), - B.as_tensor_ref(), - padding[0], - padding[1], - stride[0], - stride[1], - dilation[0], - dilation[1], - key, - tensor_name, - target); - } else { - out = pe::Depthwise_Conv2d_NCHW(A.as_tensor_ref(), - B.as_tensor_ref(), - padding[0], - padding[1], - stride[0], - stride[1], - tensor_name); - } + target.arch.Visit(adt::match{ + [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::X86Arch) { + out = pe::Conv2d_NCHW_5D(A.as_tensor_ref(), + B.as_tensor_ref(), + padding[0], + padding[1], + stride[0], + stride[1], + dilation[0], + dilation[1], + key, + tensor_name, + target); + }, + [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::NVGPUArch) { + out = pe::Depthwise_Conv2d_NCHW(A.as_tensor_ref(), + B.as_tensor_ref(), + padding[0], + padding[1], + stride[0], + stride[1], + tensor_name); + }, + }); } else if (data_format == "NHWC") { out = pe::Depthwise_Conv2d_NHWC(A.as_tensor_ref(), B.as_tensor_ref(), @@ -934,11 +982,14 @@ std::shared_ptr StrategyForDepthwiseConv2d( ir::ModuleExpr mod_expr(vec_ast); ir::IRSchedule ir_sch(mod_expr); ir_sch.MergeExprs(); - if (target.arch == Target::Arch::NVGPU) { - pe::IRCudaScheduleDepthwiseConv(ir_sch, vec_tensor); - } else { - CINN_NOT_IMPLEMENTED - } + target.arch.Visit(adt::match{ + [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::X86Arch) { CINN_NOT_IMPLEMENTED; }, + [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::NVGPUArch) { + pe::IRCudaScheduleDepthwiseConv(ir_sch, vec_tensor); + }, + }); std::vector res{ cinn::common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; *ret = cinn::common::CINNValuePack{res}; @@ -1063,7 +1114,8 @@ std::shared_ptr StrategyForBatchNorm( CHECK(Variance.as_tensor()); ir::Tensor out; auto tensor_input = A.as_tensor_ref(); - if (tensor_input->shape.size() != 4 && target.arch == Target::Arch::X86) { + if (tensor_input->shape.size() != 4 && + std::holds_alternative(target.arch)) { CHECK_EQ(input_layouts.size(), 5U) << "batch_norm_NCHWc's input layout should be 5"; std::string input_layout = input_layouts[0]; @@ -1246,16 +1298,25 @@ std::shared_ptr StrategyForPool1d( auto block_input_pad = ir_sch.GetBlock(input_pad.as_tensor()->name); ir_sch.ComputeInline(block_input_pad); } - if (target.arch == Target::Arch::NVGPU) { - CHECK(!vec_tensor.empty()); - Expr Out = vec_tensor[0]; - CHECK(Out.as_tensor()); - auto loops = ir_sch.GetLoops(Out.as_tensor()->name); - ir_sch.Split(loops[1], {-1, 2}); - loops = ir_sch.GetLoops(Out.as_tensor()->name); - ir_sch.Bind(loops[0], "blockIdx.x"); - ir_sch.Bind(loops[1], "threadIdx.x"); - } + target.arch.Visit(adt::match{ + [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::X86Arch) { + // Do nothing. + }, + [&](common::ARMArch) { + // Do nothing. 
+ },
+ [&](common::NVGPUArch) {
+ CHECK(!vec_tensor.empty());
+ Expr Out = vec_tensor[0];
+ CHECK(Out.as_tensor());
+ auto loops = ir_sch.GetLoops(Out.as_tensor()->name);
+ ir_sch.Split(loops[1], {-1, 2});
+ loops = ir_sch.GetLoops(Out.as_tensor()->name);
+ ir_sch.Bind(loops[0], "blockIdx.x");
+ ir_sch.Bind(loops[1], "threadIdx.x");
+ },
+ });
std::vector<CINNValue> res{CINNValue(ir_sch.GetModule().GetExprs().at(0))};
*ret = CINNValuePack{res};
});
@@ -1469,11 +1530,12 @@ std::shared_ptr<OpStrategy> StrategyForPool2d(
ir::ModuleExpr mod_expr(vec_ast);
ir::IRSchedule ir_sch(mod_expr);
ir_sch.MergeExprs();
- if (target.arch == Target::Arch::NVGPU) {
- pe::IRGlobalPoolScheduleGPU(ir_sch, target);
- } else {
- CINN_NOT_IMPLEMENTED
- }
+ target.arch.Visit(adt::match{
+ [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; },
+ [&](common::X86Arch) { CINN_NOT_IMPLEMENTED; },
+ [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; },
+ [&](common::NVGPUArch) { pe::IRGlobalPoolScheduleGPU(ir_sch, target); },
+ });
std::vector<CINNValue> res{CINNValue(ir_sch.GetModule().GetExprs().at(0))};
*ret = CINNValuePack{res};
});
@@ -1548,9 +1610,14 @@ std::shared_ptr<OpStrategy> StrategyForPool2d(
auto block_input_pad = ir_sch.GetBlock(input_pad_name);
ir_sch.ComputeInline(block_input_pad);
}
- if (target.arch == Target::Arch::NVGPU) {
- pe::IRPoolScheduleGPU(ir_sch, target, arg_pack_size);
- }
+ target.arch.Visit(adt::match{
+ [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; },
+ [&](common::X86Arch) {},
+ [&](common::ARMArch) {},
+ [&](common::NVGPUArch) {
+ pe::IRPoolScheduleGPU(ir_sch, target, arg_pack_size);
+ },
+ });
std::vector<CINNValue> res{CINNValue(ir_sch.GetModule().GetExprs().at(0))};
*ret = CINNValuePack{res};
});
@@ -1558,15 +1625,23 @@ std::shared_ptr<OpStrategy> StrategyForPool2d(
auto strategy = std::make_shared<OpStrategy>();
bool use_warp_reduce = false;
- if (global_pooling && data_format == "NCHW" &&
- target.arch == Target::Arch::NVGPU) {
- // TODO(hp03): 32 may not be the exact number, try also 16 or 8 or other
- // number
- // we choose 32 to make sure all the threads in a warp has work to do,
- if ((A_tensor->shape[2].as_int32() * A_tensor->shape[3].as_int32()) >= 32) {
- use_warp_reduce = true;
- }
- }
+ target.arch.Visit(adt::match{
+ [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; },
+ [&](common::X86Arch) { use_warp_reduce = false; },
+ [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; },
+ [&](common::NVGPUArch) {
+ if (global_pooling && data_format == "NCHW") {
+ // TODO(hp03): 32 may not be the exact number; try also 16, 8, or
+ // another number. We choose 32 to make sure all the threads in a
+ // warp have work to do.
+ if ((A_tensor->shape[2].as_int32() * A_tensor->shape[3].as_int32()) >=
+ 32) {
+ use_warp_reduce = true;
+ }
+ }
+ },
+ });
strategy->AddImpl(pool2d_compute, pool2d_schedule, "strategy.pool2d.x86", 1);
if (use_warp_reduce) {
strategy->AddImpl(global_pool2d_compute,
@@ -1778,16 +1853,21 @@ std::shared_ptr<OpStrategy> StrategyForPool3d(
auto block_input_pad = ir_sch.GetBlock(input_pad.as_tensor()->name);
ir_sch.ComputeInline(block_input_pad);
}
- if (target.arch == Target::Arch::NVGPU) {
- CHECK(!vec_tensor.empty());
- Expr Out = vec_tensor[0];
- CHECK(Out.as_tensor());
- auto loops = ir_sch.GetLoops(Out.as_tensor()->name);
- ir_sch.Split(loops[1], {-1, 2});
- loops = ir_sch.GetLoops(Out.as_tensor()->name);
- ir_sch.Bind(loops[0], "blockIdx.x");
- ir_sch.Bind(loops[1], "threadIdx.x");
- }
+ target.arch.Visit(adt::match{
+ [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; },
+ [&](common::X86Arch) { /*nothing*/ },
+ [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; },
+
[&](common::NVGPUArch) {
+ CHECK(!vec_tensor.empty());
+ Expr Out = vec_tensor[0];
+ CHECK(Out.as_tensor());
+ auto loops = ir_sch.GetLoops(Out.as_tensor()->name);
+ ir_sch.Split(loops[1], {-1, 2});
+ loops = ir_sch.GetLoops(Out.as_tensor()->name);
+ ir_sch.Bind(loops[0], "blockIdx.x");
+ ir_sch.Bind(loops[1], "threadIdx.x");
+ },
+ });
std::vector<CINNValue> res{CINNValue(ir_sch.GetModule().GetExprs().at(0))};
*ret = CINNValuePack{res};
});
@@ -1912,12 +1992,12 @@ std::shared_ptr<OpStrategy> StrategyForSoftmax(
const std::vector<std::vector<int>> &output_shapes,
const Target &target) {
int axis = -1;
- bool use_mkldnn = false;
+ bool use_onednn = false;
if (attrs.attr_store.count("axis")) {
axis = absl::get<int>(attrs.attr_store.at("axis"));
}
- if (attrs.attr_store.count("use_mkldnn")) {
- use_mkldnn = absl::get<bool>(attrs.attr_store.at("use_mkldnn"));
+ if (attrs.attr_store.count("use_onednn")) {
+ use_onednn = absl::get<bool>(attrs.attr_store.at("use_onednn"));
}
framework::CINNCompute softmax_compute(
[=](lang::Args args, lang::RetValue *ret) {
@@ -1942,8 +2022,8 @@ std::shared_ptr<OpStrategy> StrategyForSoftmax(
pack_args[pack_args.size() - 1].operator std::string();
#ifdef CINN_WITH_DNNL
- if (use_mkldnn) {
- out = pe::SoftmaxMKLDNN(A, new_axis, tensor_name);
+ if (use_onednn) {
+ out = pe::SoftmaxONEDNN(A, new_axis, tensor_name);
} else {
out = pe::Softmax(A, new_axis, tensor_name);
}
@@ -1979,37 +2059,42 @@ std::shared_ptr<OpStrategy> StrategyForSoftmax(
ir::ModuleExpr mod_expr(vec_ast);
ir::IRSchedule ir_sch(mod_expr);
ir_sch.MergeExprs();
- if (target.arch == Target::Arch::NVGPU) {
- if (output_shapes[0].size() > 1) {
- auto all_blocks = ir_sch.GetAllBlocks();
- CHECK_EQ(all_blocks.size(), 3);
- auto loops = ir_sch.GetLoops(all_blocks[2]);
- ir_sch.ComputeAt(all_blocks[1], loops.back());
-
- if (output_shapes[0][0] != 1) {
- ir_sch.SimpleComputeAt(all_blocks[0], loops[0]);
- }
+ target.arch.Visit(adt::match{
+ [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; },
+ [&](common::X86Arch) {
+ pe::IRSoftmaxScheduleCPU(ir_sch, axis);
+ std::vector<CINNValue> res{
+ CINNValue(ir_sch.GetModule().GetExprs().at(0))};
+ *ret = CINNValuePack{res};
+ },
+ [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; },
+ [&](common::NVGPUArch) {
+ if (output_shapes[0].size() > 1) {
+ auto all_blocks = ir_sch.GetAllBlocks();
+ CHECK_EQ(all_blocks.size(), 3);
+ auto loops = ir_sch.GetLoops(all_blocks[2]);
+ ir_sch.ComputeAt(all_blocks[1], loops.back());
+
+ if (output_shapes[0][0] != 1) {
+ ir_sch.SimpleComputeAt(all_blocks[0], loops[0]);
+ }
- loops = ir_sch.GetLoops(all_blocks[2]);
- int loop_index = 1;
- if (output_shapes[0][0] == 1) loop_index--;
- CHECK_GE(loops.size(), loop_index + 1);
- auto splited_loops = ir_sch.Split(loops[loop_index], {-1, 5});
+ loops = ir_sch.GetLoops(all_blocks[2]);
+ int loop_index = 1;
+ if (output_shapes[0][0] == 1) loop_index--;
+ CHECK_GE(loops.size(), loop_index + 1);
+ auto splited_loops = ir_sch.Split(loops[loop_index], {-1, 5});
- all_blocks = ir_sch.GetAllBlocks();
- loops = ir_sch.GetLoops(all_blocks[2]);
- ir_sch.Bind(loops[0], "blockIdx.x");
- ir_sch.Bind(loops[1], "threadIdx.x");
- }
- std::vector<CINNValue> res{
- CINNValue(ir_sch.GetModule().GetExprs().at(0))};
- *ret = CINNValuePack{res};
- } else if (target.arch == Target::Arch::X86) {
- pe::IRSoftmaxScheduleCPU(ir_sch, axis);
- std::vector<CINNValue> res{
- CINNValue(ir_sch.GetModule().GetExprs().at(0))};
- *ret = CINNValuePack{res};
- }
+ all_blocks = ir_sch.GetAllBlocks();
+ loops = ir_sch.GetLoops(all_blocks[2]);
+ ir_sch.Bind(loops[0], "blockIdx.x");
+ ir_sch.Bind(loops[1], "threadIdx.x");
+ }
+ std::vector<CINNValue> res{
+
CINNValue(ir_sch.GetModule().GetExprs().at(0))};
+ *ret = CINNValuePack{res};
+ },
+ });
});
auto strategy = std::make_shared<OpStrategy>();
@@ -2043,7 +2128,7 @@ std::vector<std::vector<std::string>> InferLayoutForSoftmax(
CHECK_EQ(input_layouts.size(), 1U)
<< "The input's layout size is not 1! Please check again.";
if (input_shapes[0].size() > 4) {
- // input tensor needs to be transformed back to NCHW for mkldnn
+ // input tensor needs to be transformed back to NCHW for onednn
return {{"NCHW", "NCHW"}, {"NCHW"}};
}
return {{input_layouts[0], input_layouts[0]}, input_layouts};
@@ -2399,6 +2484,8 @@ CINN_REGISTER_HELPER(nn_ops) {
.set_num_outputs(1)
.set_attr<cinn::hlir::framework::StrategyFunction>(
"CINNStrategy", cinn::hlir::op::StrategyForRelu6)
+ .set_attr<cinn::hlir::framework::StrategyFunctionSymbolic>(
+ "CINNStrategySymbolic", cinn::hlir::op::StrategyForRelu6Symbolic)
.set_attr("infershape", MakeOpFunction(cinn::hlir::op::InferShapeForRelu))
.set_attr("inferdtype", MakeOpFunction(cinn::hlir::op::InferDtypeForRelu))
#ifndef CINN_WITH_CUDA
diff --git a/paddle/cinn/hlir/op/op_util.cc b/paddle/cinn/hlir/op/op_util.cc
index b0976f22c38cb..37eef516bac46 100644
--- a/paddle/cinn/hlir/op/op_util.cc
+++ b/paddle/cinn/hlir/op/op_util.cc
@@ -72,18 +72,45 @@ CINNSchedule GetInjectiveScheduleFunc(
ir::IRSchedule ir_sch(mod_expr);
ir_sch.MergeExprs();
pe::IRInjectiveSchedule(ir_sch, output_shapes.front(), target);
- /*if (target.arch == Target::Arch::NVGPU) {
- pe::IRInjectiveSchedule(ir_sch, output_shapes.front(), target);
- } else if (target.arch == Target::Arch::X86) {
- pe::IRScheduleInjectiveCPU(ir_sch, output_shapes.front(), target,
- vectorizable);
- }*/
std::vector<cinn::common::CINNValue> res{
cinn::common::CINNValue(ir_sch.GetModule().GetExprs().at(0))};
*ret = cinn::common::CINNValuePack{res};
});
}
+std::string GetExternFuncNameArchPrefixImpl(common::UnknownArch,
+ const std::string& func_name) {
+ std::stringstream ss;
+ ss << func_name << " only supports X86 and NVGPU! Please Check.\n";
+ PADDLE_THROW(phi::errors::Fatal(ss.str()));
+}
+
+std::string GetExternFuncNameArchPrefixImpl(common::X86Arch,
+ const std::string& func_name) {
+ return "host_";
+}
+
+std::string GetExternFuncNameArchPrefixImpl(common::ARMArch,
+ const std::string& func_name) {
+ std::stringstream ss;
+ ss << func_name << " only supports X86 and NVGPU! Please Check.\n";
+ PADDLE_THROW(phi::errors::Fatal(ss.str()));
+}
+
+std::string GetExternFuncNameArchPrefixImpl(common::NVGPUArch,
+ const std::string& func_name) {
+ return "nvgpu_";
+}
+
+std::string GetExternFuncNameArchPrefix(common::Arch arch,
+ const std::string& func_name) {
+ return std::visit(
+ [&](const auto& impl) {
+ return GetExternFuncNameArchPrefixImpl(impl, func_name);
+ },
+ arch.variant());
+}
+
std::string GetExternFuncName(const cinn::common::Target& target,
const cinn::common::Type& type,
const std::string& func_name,
@@ -95,15 +122,8 @@ std::string GetExternFuncName(const cinn::common::Target& target,
func_proto_name.append("cinn_");
}
if (need_target) {
- if (target.arch == cinn::common::Target::Arch::NVGPU) {
- func_proto_name.append("nvgpu_");
- } else if (target.arch == cinn::common::Target::Arch::X86) {
- func_proto_name.append("host_");
- } else {
- std::stringstream ss;
- ss << func_name << " only supports X86 and NVGPU!
Please Check.\n";
- PADDLE_THROW(phi::errors::Fatal(ss.str()));
- }
+ const auto& prefix = GetExternFuncNameArchPrefix(target.arch, func_name);
+ func_proto_name.append(prefix);
}
func_proto_name.append(func_name);
if (!need_type) {
diff --git a/paddle/cinn/hlir/op/reduction.cc b/paddle/cinn/hlir/op/reduction.cc
index d5a378dc809e6..b3180ba555f3a 100644
--- a/paddle/cinn/hlir/op/reduction.cc
+++ b/paddle/cinn/hlir/op/reduction.cc
@@ -205,7 +205,7 @@ std::shared_ptr<OpStrategy> StrategyForReduce(
ir::ModuleExpr mod_expr(vec_ast);
ir::IRSchedule ir_sch(mod_expr);
ir_sch.MergeExprs();
- if (!FLAGS_cinn_new_group_scheduler && target.arch == Target::Arch::NVGPU) {
+ const auto ReduceSchedule = [&]() {
if (!WithoutLastDimInReduce(inputs[0]->shape, reduce_axes)) {
if (arg_pack.size() == 4) {
CHECK_EQ(vec_tensor.size(), 2);
@@ -307,11 +307,29 @@ std::shared_ptr<OpStrategy> StrategyForReduce(
PADDLE_THROW(phi::errors::InvalidArgument("Unkown Reduce Type!"));
}
}
- } else {
- std::vector<CINNValue> res{
- CINNValue(ir_sch.GetModule().GetExprs().at(0))};
- *ret = CINNValuePack{res};
- }
+ };
+ target.arch.Visit(adt::match{
+ [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; },
+ [&](common::X86Arch) {
+ std::vector<CINNValue> res{
+ CINNValue(ir_sch.GetModule().GetExprs().at(0))};
+ *ret = CINNValuePack{res};
+ },
+ [&](common::ARMArch) {
+ std::vector<CINNValue> res{
+ CINNValue(ir_sch.GetModule().GetExprs().at(0))};
+ *ret = CINNValuePack{res};
+ },
+ [&](common::NVGPUArch) {
+ if (!FLAGS_cinn_new_group_scheduler) {
+ ReduceSchedule();
+ } else {
+ std::vector<CINNValue> res{
+ CINNValue(ir_sch.GetModule().GetExprs().at(0))};
+ *ret = CINNValuePack{res};
+ }
+ },
+ });
});
auto strategy = std::make_shared<OpStrategy>();
diff --git a/paddle/cinn/hlir/op/transform.cc b/paddle/cinn/hlir/op/transform.cc
index 21754487e7846..3d7bfdbf3623c 100644
--- a/paddle/cinn/hlir/op/transform.cc
+++ b/paddle/cinn/hlir/op/transform.cc
@@ -89,27 +89,32 @@ std::shared_ptr<OpStrategy> StrategyForMatMul(
auto new_B = tensor_B->Reshape(new_shape_B_e, stages);
std::vector<ir::Tensor> out;
- if (target.arch == Target::Arch::X86) {
+ target.arch.Visit(adt::match{
+ [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; },
+ [&](common::X86Arch) {
#ifdef CINN_WITH_MKL_CBLAS
- out = pe::MatmulMKL(new_A,
- new_B,
- trans_a,
- trans_b,
- alpha,
- UniqName("MatmulMKL_output"),
- target);
+ out = pe::MatmulMKL(new_A,
+ new_B,
+ trans_a,
+ trans_b,
+ alpha,
+ UniqName("MatmulMKL_output"),
+ target);
#else
- out = pe::MatmulV2(new_A,
- new_B,
- trans_a,
- trans_b,
- alpha,
- UniqName("MatmulV2_output"),
- target);
+ out = pe::MatmulV2(new_A,
+ new_B,
+ trans_a,
+ trans_b,
+ alpha,
+ UniqName("MatmulV2_output"),
+ target);
#endif
- } else {
- out = pe::Matmul(new_A, new_B, trans_a, trans_b, alpha, tensor_name);
- }
+ },
+ [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; },
+ [&](common::NVGPUArch) {
+ out = pe::Matmul(new_A, new_B, trans_a, trans_b, alpha, tensor_name);
+ },
+ });
std::vector<CINNValue> res;
for (auto &t : out) {
@@ -619,17 +624,23 @@ std::shared_ptr<OpStrategy> StrategyForMul(
CHECK(pack_args.back().is_string());
std::string tensor_name = pack_args.back().operator std::string();
- if (target.arch == Target::Arch::X86) {
+ target.arch.Visit(adt::match{
+ [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; },
+ [&](common::X86Arch) {
#ifdef CINN_WITH_MKL_CBLAS
- out = pe::MatmulMKL(
- new_A, new_B, false, is_infer, 1.0f, tensor_name, target);
+ out = pe::MatmulMKL(
+ new_A, new_B, false, is_infer, 1.0f, tensor_name, target);
#else
- out = pe::MatmulV2(
- new_A, new_B, false, is_infer, 1.0f, tensor_name, target);
+ out = pe::MatmulV2(
+ new_A, new_B,
false, is_infer, 1.0f, tensor_name, target);
#endif
- } else {
- out = pe::Matmul(new_A, new_B, false, is_infer, 1.0f, tensor_name);
- }
+ },
+ [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; },
+ [&](common::NVGPUArch) {
+ out =
+ pe::Matmul(new_A, new_B, false, is_infer, 1.0f, tensor_name);
+ },
+ });
std::vector<CINNValue> res;
for (auto &t : out) {
@@ -854,7 +865,7 @@ std::shared_ptr<OpStrategy> StrategyForLayoutTransform(
ir::IRSchedule ir_sch(mod_expr);
ir_sch.MergeExprs();
- if (target.arch == Target::Arch::X86) {
+ if (std::holds_alternative<common::X86Arch>(target.arch)) {
pe::IRScheduleInjectiveCPU(ir_sch, output_shapes.front(), target);
} else {
CINN_NOT_IMPLEMENTED
diff --git a/paddle/cinn/hlir/pass/CMakeLists.txt b/paddle/cinn/hlir/pass/CMakeLists.txt
index 328ecb6d4e0dc..744c0fe12c8a0 100644
--- a/paddle/cinn/hlir/pass/CMakeLists.txt
+++ b/paddle/cinn/hlir/pass/CMakeLists.txt
@@ -46,7 +46,7 @@ if(NOT WITH_CUDA)
#cinn_cc_test(test_alterlayout SRCS alterlayout_test.cc DEPS cinncore)
endif()
cinn_cc_test(test_dot_merger SRCS test_dot_merger.cc DEPS cinncore)
-cinn_cc_test(test_dce_pass SRCS dce_pass_test.cc DEPS cinncore)
+# cinn_cc_test(test_dce_pass SRCS dce_pass_test.cc DEPS cinncore)
cinn_cc_test(test_common_subexpression_elimination SRCS
common_subexpression_elimination_test.cc DEPS cinncore)
cinn_cc_test(test_constant_folding_pass SRCS constant_folding_pass_test.cc DEPS
diff --git a/paddle/cinn/hlir/pass/alterlayout.cc b/paddle/cinn/hlir/pass/alterlayout.cc
index 8ca3475c2d7e3..74c8c0915e0af 100644
--- a/paddle/cinn/hlir/pass/alterlayout.cc
+++ b/paddle/cinn/hlir/pass/alterlayout.cc
@@ -140,7 +140,7 @@ std::vector UpdateInferInfos(
void AlterLayoutPass(Graph* graph) {
// alter layout only in X86 for it's specific layout requirements
- if (graph->target_.arch == Target::Arch::X86) {
+ if (std::holds_alternative<common::X86Arch>(graph->target_.arch)) {
auto store_nodes = std::get<0>(graph->topological_order());
auto& shape_dict = graph->GetMutableAttrs<
absl::flat_hash_map<std::string, framework::shape_t>>("infershape");
diff --git a/paddle/cinn/hlir/pe/broadcast.cc b/paddle/cinn/hlir/pe/broadcast.cc
index fb47ed737fdf3..fab2af9c5f0dc 100644
--- a/paddle/cinn/hlir/pe/broadcast.cc
+++ b/paddle/cinn/hlir/pe/broadcast.cc
@@ -400,10 +400,7 @@ Tensor BroadcastTo(const Tensor& A,
} else if (MathEqual(a_shape_i, out_shape[idx])) {
broadcast_indice.push_back(indice[idx]);
} else {
- std::stringstream ss;
- ss << "fail to broad cast input shape " << a_shape_i
- << " to output shape " << out_shape[idx];
- PADDLE_THROW(phi::errors::InvalidArgument(ss.str()));
+ broadcast_indice.push_back(indice[idx] % a_shape_i);
}
}
return A(broadcast_indice);
diff --git a/paddle/cinn/hlir/pe/elementwise.cc b/paddle/cinn/hlir/pe/elementwise.cc
index 663b32451ae12..559014658de0e 100644
--- a/paddle/cinn/hlir/pe/elementwise.cc
+++ b/paddle/cinn/hlir/pe/elementwise.cc
@@ -288,7 +288,6 @@ ir::Tensor Reshape(const ir::Tensor& A,
auto temp = inner_offset % A_expr_shape[i];
indice_a.insert(indice_a.begin(), temp);
}
- LOG(INFO) << "indice_a = " << indice_a[0];
return A(indice_a);
},
name);
@@ -340,9 +339,16 @@ ir::Tensor Tril(const ir::Tensor& A,
ir::Tensor res = Compute(
ToCinnExprs(out_shape),
[=](const std::vector<Expr>& indice) {
- return ir::Select::Make(indice[0] >= indice[1] - diagonal,
+ PADDLE_ENFORCE_GE(indice.size(),
+ size_t(2),
+ phi::errors::InvalidArgument(
+ "The Tril op input tensor must have a rank "
+ "greater than or equal to 2."));
+ std::vector<Expr> new_indice(indice.end() - 2, indice.end());
+ Expr col_indice = indice.back();
+ return ir::Select::Make(new_indice[0] >= new_indice[1] -
diagonal,
A(indice),
- ir::Expr(static_cast<float>(0.)));
+ ir::Zero(A->type()));
},
name);
return res;
}
diff --git a/paddle/cinn/hlir/pe/ir_schedule_pe.cc b/paddle/cinn/hlir/pe/ir_schedule_pe.cc
index d224a5fd1e1ca..4950d575015bc 100644
--- a/paddle/cinn/hlir/pe/ir_schedule_pe.cc
+++ b/paddle/cinn/hlir/pe/ir_schedule_pe.cc
@@ -184,8 +184,8 @@ std::vector<ir::Expr> IRCudaScheduleMatMul(
const cinn::common::CINNValuePack &arg_pack,
const std::vector<int> &output_shape,
const cinn::common::Target &target) {
- if (target.arch == Target::Arch::X86) {
- CINN_NOT_IMPLEMENTED
+ if (!std::holds_alternative<common::NVGPUArch>(target.arch)) {
+ CINN_NOT_IMPLEMENTED;
}
std::vector<ir::Expr> vec_ast;
for (int i = 0; i < arg_pack.size(); i++) {
diff --git a/paddle/cinn/hlir/pe/nn.cc b/paddle/cinn/hlir/pe/nn.cc
index 9e48b26ae9392..48bbda538351b 100644
--- a/paddle/cinn/hlir/pe/nn.cc
+++ b/paddle/cinn/hlir/pe/nn.cc
@@ -652,7 +652,7 @@ std::vector<ir::Tensor> Conv2d_NCHWc(const ir::Tensor &input,
}
#ifdef CINN_WITH_DNNL
-std::vector<ir::Tensor> Conv2d_NCHW_MKLDNN(const ir::Tensor &input,
+std::vector<ir::Tensor> Conv2d_NCHW_ONEDNN(const ir::Tensor &input,
const ir::Tensor &weights,
int pad_h,
int pad_w,
@@ -674,7 +674,7 @@ std::vector<ir::Tensor> Conv2d_NCHW_MKLDNN(const ir::Tensor &input,
auto call = Compute(
{Expr(1)},
[=]() -> Expr {
- return lang::CallExtern("cinn_cpu_mkldnn_conv2d_nchw_fp32",
+ return lang::CallExtern("cinn_cpu_onednn_conv2d_nchw_fp32",
{
Expr(input->shape[0]), // batch_size
Expr(input->shape[1]), // c_in
@@ -694,7 +694,7 @@ std::vector<ir::Tensor> Conv2d_NCHW_MKLDNN(const ir::Tensor &input,
weights // weights
});
},
- UniqName("conv2d_nchw_mkldnn_out"));
+ UniqName("conv2d_nchw_onednn_out"));
auto out = call->TupleGet(0);
out->WithBuffer(input->type());
return {out, call};
@@ -1020,11 +1020,11 @@ std::vector<ir::Tensor> Softmax(const ir::Tensor &A,
}
#ifdef CINN_WITH_DNNL
-std::vector<ir::Tensor> SoftmaxMKLDNN(const ir::Tensor &A,
+std::vector<ir::Tensor> SoftmaxONEDNN(const ir::Tensor &A,
int axis,
const std::string &output_name) {
CHECK_LE(A->shape.size(), 4U)
- << "Input's dimension of mkldnn softmax op is less than 4! Please check.";
+ << "Input's dimension of onednn softmax op must not exceed 4!
Please check."; if (axis == -1) { axis = A->shape.size() - 1; } @@ -1036,7 +1036,7 @@ std::vector SoftmaxMKLDNN(const ir::Tensor &A, auto call = Compute( {Expr(1)}, [=]() -> Expr { - return lang::CallExtern("cinn_cpu_mkldnn_softmax_fp32", + return lang::CallExtern("cinn_cpu_onednn_softmax_fp32", { shape[0], // batch_size shape[1], // c_in diff --git a/paddle/cinn/hlir/pe/nn.h b/paddle/cinn/hlir/pe/nn.h index 32e2db2dc38f7..c3cc860e657bc 100755 --- a/paddle/cinn/hlir/pe/nn.h +++ b/paddle/cinn/hlir/pe/nn.h @@ -179,7 +179,7 @@ std::vector Conv2d_NCHWc( const cinn::common::Target &target = cinn::common::DefaultHostTarget()); #ifdef CINN_WITH_DNNL -std::vector Conv2d_NCHW_MKLDNN( +std::vector Conv2d_NCHW_ONEDNN( const ir::Tensor &input, const ir::Tensor &weights, int pad_h, @@ -333,7 +333,7 @@ std::vector Softmax( const std::string &output_name = UniqName("T_softmax_out")); #ifdef CINN_WITH_DNNL -std::vector SoftmaxMKLDNN( +std::vector SoftmaxONEDNN( const ir::Tensor &A, int axis = -1, const std::string &output_name = UniqName("T_softmax_out")); diff --git a/paddle/cinn/hlir/pe/schedule.cc b/paddle/cinn/hlir/pe/schedule.cc index 3e4af70e1b1cc..0206c288738ff 100644 --- a/paddle/cinn/hlir/pe/schedule.cc +++ b/paddle/cinn/hlir/pe/schedule.cc @@ -36,21 +36,31 @@ namespace cinn { namespace hlir { namespace pe { -ScheduleParam::ScheduleParam(cinn::common::Target::Arch arch) { - switch (arch) { - case cinn::common::Target::Arch::X86: { - param_data = CreateX86Params(); - break; - } - case cinn::common::Target::Arch::NVGPU: { - param_data = CreateCudaParams(); - break; - } - default: { - PADDLE_THROW(phi::errors::InvalidArgument( - "Schedule params must be initialized with target x86 or nvgpu.")); - } - } +using ParamsT = + absl::flat_hash_map>>; + +ParamsT CreateParamsImpl(common::UnknownArch) { + PADDLE_THROW(phi::errors::InvalidArgument( + "Schedule params must be initialized with target x86 or nvgpu.")); +} + +ParamsT CreateParamsImpl(common::X86Arch) { return CreateX86Params(); } + +ParamsT CreateParamsImpl(common::ARMArch) { + PADDLE_THROW(phi::errors::InvalidArgument( + "Schedule params must be initialized with target x86 or nvgpu.")); +} + +ParamsT CreateParamsImpl(common::NVGPUArch) { return CreateCudaParams(); } + +ParamsT CreateParams(common::Arch arch) { + return std::visit([](const auto &impl) { return CreateParamsImpl(impl); }, + arch.variant()); +} + +ScheduleParam::ScheduleParam(cinn::common::Arch arch) { + param_data = CreateParams(arch); } ScheduleParam::~ScheduleParam() {} @@ -873,7 +883,7 @@ void Conv2d_NCHWc_1X1_Schedule_CPU(poly::StageMap stages, const cinn::common::Target &target, const std::string &key, bool do_padding) { - CHECK(target.arch == Target::Arch::X86) + CHECK(std::holds_alternative(target.arch)) << "Conv2d_NCHWc_1X1_Schedule_CPU schedule only used in x86"; CHECK(packed_out.defined()); CHECK(input_pad.defined()); @@ -1022,7 +1032,7 @@ void Conv2d_NCHWc_1X1_Schedule_CPU_Nofuse(poly::StageMap stages, const ir::Tensor &weights_dilation, const ir::Tensor &data, const cinn::common::Target &target) { - CHECK(target.arch == Target::Arch::X86) + CHECK(std::holds_alternative(target.arch)) << "Conv2d_NCHWc_1X1_Schedule_CPU_Nofuse schedule only used in x86"; CHECK(packed_out.defined()); CHECK(input_pad.defined()); @@ -1144,7 +1154,7 @@ void Conv2d_NCHWc_Schedule_CPU_Nofuse(poly::StageMap stages, const ir::Tensor &weights_dilation, const ir::Tensor &data, const cinn::common::Target &target) { - CHECK(target.arch == Target::Arch::X86) + CHECK(std::holds_alternative(target.arch)) 
<< "Conv2d_NCHWc_Schedule_CPU_Nofuse schedule only used in x86"; CHECK(packed_out.defined()); CHECK(input_pad.defined()); @@ -1251,7 +1261,7 @@ void Conv2d_NCHWc_Schedule_CPU(poly::StageMap stages, const cinn::common::Target &target, const std::string &key, bool do_padding) { - CHECK(target.arch == Target::Arch::X86) + CHECK(std::holds_alternative(target.arch)) << "Conv2d_NCHWc_Schedule_CPU schedule only used in x86"; CHECK(packed_out.defined()); CHECK(input_pad.defined()); @@ -1383,7 +1393,7 @@ void Depthwise_Conv2d_NCHWc_Schedule_CPU_Nofuse( const ir::Tensor &data, const cinn::common::Target &target, bool do_padding) { - CHECK(target.arch == Target::Arch::X86) + CHECK(std::holds_alternative(target.arch)) << "Depthwise_Conv2d_NCHWc_Schedule_CPU_Nofuse schedule only used in x86"; CHECK(packed_out.defined()); CHECK(input_pad.defined()); @@ -2813,16 +2823,21 @@ void CudaSplitSchedule(cinn::common::CINNValuePack *arg_pack, if (i != axis) fused_shape = fused_shape * output_shapes[0][i]; } int compute_at_level = 0; - if (target.arch == Target::Arch::NVGPU) { - if (fused_shape > target.max_num_threads()) { - stages[last_output]->Split(0, target.max_num_threads()); - stages[last_output]->Bind(0, "blockIdx.x"); - stages[last_output]->Bind(1, "threadIdx.x"); - compute_at_level++; - } else { - stages[last_output]->Bind(0, "threadIdx.x"); - } - } + target.arch.Visit(adt::match{ + [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::X86Arch) {}, + [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::NVGPUArch) { + if (fused_shape > target.max_num_threads()) { + stages[last_output]->Split(0, target.max_num_threads()); + stages[last_output]->Bind(0, "blockIdx.x"); + stages[last_output]->Bind(1, "threadIdx.x"); + compute_at_level++; + } else { + stages[last_output]->Bind(0, "threadIdx.x"); + } + }, + }); for (int i = 0; i < out_tensors.size() - 1; i++) { stages[out_tensors[i]]->ComputeAt2(stages[last_output], compute_at_level); diff --git a/paddle/cinn/hlir/pe/schedule.h b/paddle/cinn/hlir/pe/schedule.h index 7aef85c77518e..a8af004e04960 100644 --- a/paddle/cinn/hlir/pe/schedule.h +++ b/paddle/cinn/hlir/pe/schedule.h @@ -35,11 +35,11 @@ class ScheduleParam { ScheduleParam(const ScheduleParam &) = delete; ScheduleParam &operator=(const ScheduleParam &) = delete; static ScheduleParam &get_cuda_instance() { - static ScheduleParam instance{cinn::common::Target::Arch::NVGPU}; + static ScheduleParam instance{cinn::common::NVGPUArch{}}; return instance; } static ScheduleParam &get_x86_instance() { - static ScheduleParam instance{cinn::common::Target::Arch::X86}; + static ScheduleParam instance{cinn::common::X86Arch{}}; return instance; } absl::flat_hash_map>> param_data; diff --git a/paddle/cinn/hlir/pe/transform.cc b/paddle/cinn/hlir/pe/transform.cc index 3cd4120f89a1b..6f42a2268b35d 100644 --- a/paddle/cinn/hlir/pe/transform.cc +++ b/paddle/cinn/hlir/pe/transform.cc @@ -565,7 +565,7 @@ std::vector MatmulMKL(const Tensor& A, float alpha, const std::string& name, const cinn::common::Target& target) { - CHECK(target.arch == Target::Arch::X86) + CHECK(std::holds_alternative(target.arch)) << "mkl should be used in the cpu environment"; std::vector shape_A = A->shape; std::vector shape_B = B->shape; @@ -658,10 +658,19 @@ int GetMulFactor(int shape, return split_factor; } -std::vector MulBase(const Tensor& A, - const Tensor& B, - const std::string& name, - const cinn::common::Target& target) { +std::vector MulBaseCallImpl(common::UnknownArch, + const Tensor& A, + const Tensor& B, + const 
std::string& name, + const cinn::common::Target& target) { + LOG(FATAL) << "NotImplemented."; +} + +std::vector MulBaseCallImpl(common::X86Arch, + const Tensor& A, + const Tensor& B, + const std::string& name, + const cinn::common::Target& target) { std::vector output_shape; CHECK_EQ(A->shape.size(), 2U) << "tensor_A's shape size should be two while current shape size is " @@ -674,55 +683,96 @@ std::vector MulBase(const Tensor& A, output_shape.push_back(A->shape[0]); output_shape.push_back(B->shape[0]); - if (target.arch == Target::Arch::X86) { - int reduce_dim = A->shape[1].as_int32(); - int split_factor = GetMulFactor(reduce_dim, A->type(), target); - Var reduce_k_first( - ir::Cast::Make(A->shape[1]->type(), Expr(reduce_dim / split_factor)), - UniqName("reduce_k_first")); - auto mul_reduce_first = Compute( - {A->shape[0], B->shape[0], Expr(split_factor)}, - [=](const std::vector& indice) { - CHECK_EQ(indice.size(), 3U) - << "indice size should be three while current size is " - << indice.size(); - return lang::ReduceSum( - A({indice[0], reduce_k_first * Expr(split_factor) + indice[2]}) * - B({indice[1], - reduce_k_first * Expr(split_factor) + indice[2]}), - {reduce_k_first}); - }, - UniqName("mul_reduce_k_first")); - Var reduce_k_second(ir::Cast::Make(A->shape[1]->type(), Expr(split_factor)), - UniqName("reduce_k_second")); - return {Compute( - output_shape, - [=](const std::vector& indice) { - std::vector new_indice = indice; - new_indice.push_back(reduce_k_second); - return lang::ReduceSum(mul_reduce_first(new_indice), - {reduce_k_second}); - }, - name), - mul_reduce_first}; - } else { - Var reduce_k(A->shape[1], UniqName("reduce_k")); - return {Compute( - output_shape, - [=](const std::vector& indice) { - std::vector A_indice; - std::vector B_indice; - CHECK_EQ(indice.size(), 2U) - << "indice size should be two while current size is " - << indice.size(); - A_indice.push_back(indice[0]); - B_indice.push_back(indice[1]); - A_indice.push_back(reduce_k); - B_indice.push_back(reduce_k); - return lang::ReduceSum(A(A_indice) * B(B_indice), {reduce_k}); - }, - name)}; - } + int reduce_dim = A->shape[1].as_int32(); + int split_factor = GetMulFactor(reduce_dim, A->type(), target); + Var reduce_k_first( + ir::Cast::Make(A->shape[1]->type(), Expr(reduce_dim / split_factor)), + UniqName("reduce_k_first")); + auto mul_reduce_first = Compute( + {A->shape[0], B->shape[0], Expr(split_factor)}, + [=](const std::vector& indice) { + CHECK_EQ(indice.size(), 3U) + << "indice size should be three while current size is " + << indice.size(); + return lang::ReduceSum( + A({indice[0], reduce_k_first * Expr(split_factor) + indice[2]}) * + B({indice[1], reduce_k_first * Expr(split_factor) + indice[2]}), + {reduce_k_first}); + }, + UniqName("mul_reduce_k_first")); + Var reduce_k_second(ir::Cast::Make(A->shape[1]->type(), Expr(split_factor)), + UniqName("reduce_k_second")); + return {Compute( + output_shape, + [=](const std::vector& indice) { + std::vector new_indice = indice; + new_indice.push_back(reduce_k_second); + return lang::ReduceSum(mul_reduce_first(new_indice), + {reduce_k_second}); + }, + name), + mul_reduce_first}; +} + +std::vector MulBaseCallImpl(common::ARMArch, + const Tensor& A, + const Tensor& B, + const std::string& name, + const cinn::common::Target& target) { + LOG(FATAL) << "NotImplemented."; +} + +std::vector MulBaseCallImpl(common::NVGPUArch, + const Tensor& A, + const Tensor& B, + const std::string& name, + const cinn::common::Target& target) { + std::vector output_shape; + 
CHECK_EQ(A->shape.size(), 2U) + << "tensor_A's shape size should be two while current shape size is " + << A->shape.size(); + CHECK_EQ(B->shape.size(), 2U) + << "tensor_B's shape size should be two while current shape size is " + << B->shape.size(); + CHECK_EQ(A->shape[1], B->shape[1]) + << "tensor_A's last shape should be same with tensor_B"; + output_shape.push_back(A->shape[0]); + output_shape.push_back(B->shape[0]); + + Var reduce_k(A->shape[1], UniqName("reduce_k")); + return {Compute( + output_shape, + [=](const std::vector& indice) { + std::vector A_indice; + std::vector B_indice; + CHECK_EQ(indice.size(), 2U) + << "indice size should be two while current size is " + << indice.size(); + A_indice.push_back(indice[0]); + B_indice.push_back(indice[1]); + A_indice.push_back(reduce_k); + B_indice.push_back(reduce_k); + return lang::ReduceSum(A(A_indice) * B(B_indice), {reduce_k}); + }, + name)}; +} + +std::vector MulBaseCall(const Tensor& A, + const Tensor& B, + const std::string& name, + const cinn::common::Target& target) { + return std::visit( + [&](const auto& impl) { + return MulBaseCallImpl(impl, A, B, name, target); + }, + target.arch.variant()); +} + +std::vector MulBase(const Tensor& A, + const Tensor& B, + const std::string& name, + const cinn::common::Target& target) { + return MulBaseCall(A, B, name, target); } std::vector Mul(const Tensor& A, @@ -751,7 +801,7 @@ std::vector MulMKL(const Tensor& A, const Tensor& B, const std::string& name, const cinn::common::Target& target) { - CHECK(target.arch == Target::Arch::X86) + CHECK(std::holds_alternative(target.arch)) << "mkl should be used in the cpu environment"; std::vector shape_A = A->shape; std::vector shape_B = B->shape; @@ -1271,14 +1321,18 @@ ir::Tensor ScatterAssign(const ir::Tensor& input, CHECK_EQ(index->type(), cinn::common::Int(32)) << "Param [Index] of ScatterAssign only support int32 ! Please Check.\n"; std::string extern_fun_name; - if (target.arch == cinn::common::Target::Arch::NVGPU) { - extern_fun_name.assign("cinn_cuda_find_int"); - } else if (target.arch == cinn::common::Target::Arch::X86) { - extern_fun_name.assign("cinn_host_find_int"); - } else { - PADDLE_THROW(phi::errors::Fatal( - "ScatterAssign only support X86 and NVGPU ! Please Check.\n")); - } + target.arch.Visit(adt::match{ + [&](common::UnknownArch) { + PADDLE_THROW(phi::errors::Fatal( + "ScatterAssign only support X86 and NVGPU ! Please Check.\n")); + }, + [&](common::X86Arch) { extern_fun_name.assign("cinn_host_find_int"); }, + [&](common::ARMArch) { + PADDLE_THROW(phi::errors::Fatal( + "ScatterAssign only support X86 and NVGPU ! Please Check.\n")); + }, + [&](common::NVGPUArch) { extern_fun_name.assign("cinn_cuda_find_int"); }, + }); auto pos_axis = axis; if (pos_axis < 0) pos_axis += input->shape.size(); @@ -1309,7 +1363,7 @@ ir::Tensor ScatterAdd(const ir::Tensor& input, const cinn::common::Target& target, const int axis, const std::string& output_name) { - CHECK_EQ(target.arch, cinn::common::Target::Arch::NVGPU) + CHECK(std::holds_alternative(target.arch)) << "Op IndexAdd only support NVGPU now ! 
Please Check.\n"; CHECK_EQ(index->type(), cinn::common::Int(32)) diff --git a/paddle/cinn/ir/group_schedule/config/group_tile_config.cc b/paddle/cinn/ir/group_schedule/config/group_tile_config.cc index 0d443086bdce9..eb29a090092e0 100644 --- a/paddle/cinn/ir/group_schedule/config/group_tile_config.cc +++ b/paddle/cinn/ir/group_schedule/config/group_tile_config.cc @@ -102,7 +102,8 @@ BuildPureStaticShapeConfig( } else if (base_info->reduce_numel <= 256) { // warp reduce int64_t reduce_block = Next2Power(base_info->reduce_numel); - int64_t spatial_inner_num = 256 / reduce_block; + int64_t spatial_inner_num = + std::min(256 / reduce_block, base_info->spatial_numel); int64_t tree_reduce_num = 32; int64_t warp_num = 8; BucketInfo bucket_info{/* sp_lower_bound = */ 1, diff --git a/paddle/cinn/ir/group_schedule/st_shape_group_scheduler.cc b/paddle/cinn/ir/group_schedule/st_shape_group_scheduler.cc index 1dc21ce8a3180..85890576d2647 100644 --- a/paddle/cinn/ir/group_schedule/st_shape_group_scheduler.cc +++ b/paddle/cinn/ir/group_schedule/st_shape_group_scheduler.cc @@ -555,7 +555,7 @@ void StaticShapeGroupScheduler::DoVerticalLoopFusion() { } void StaticShapeGroupScheduler::BindCudaAxis() { - if (target_.arch != Target::Arch::NVGPU) return; + if (!std::holds_alternative(target_.arch)) return; VLOG(5) << "[Start BindCudaAxis] func body: " << ir_sch_->GetModule().GetExprs().front(); @@ -594,7 +594,7 @@ std::ostream& operator<<(std::ostream& os, const Range& x) { // and MultiDimIntegerSet, re implement this function to simplify these ugly // codes. void StaticShapeGroupScheduler::AllocateStorage() { - if (target_.arch != Target::Arch::NVGPU) return; + if (!std::holds_alternative(target_.arch)) return; VLOG(5) << "[Start AllocateStorage] func body: " << ir_sch_->GetModule().GetExprs().front(); diff --git a/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc b/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc index 8a3c2dfa71356..08b587f95fd71 100644 --- a/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc +++ b/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc @@ -219,19 +219,21 @@ void TileFirstGeneralTactic::SplitWarpNumber(ir::IRSchedule* sch, }; if (!IsWarpNumGT(1)) return; - const auto LimitWarpNum = [&](const ir::Expr& loop, ScheduleConfig* config) { + const auto GetMinimalWarpNum = [&](const ir::Expr& loop, + const ScheduleConfig& config) -> int { ir::Expr extent = loop.As()->extent; common::cas_intervals_t var_intervals = common::CollectVarIntervalsOfExprs({extent}); common::SymbolicExprAnalyzer analyzer(var_intervals); const auto& proved_gt = - analyzer.ProveGT(ir::Expr(config->tile_config.warp_num), extent); + analyzer.ProveGT(ir::Expr(config.tile_config.warp_num * 32), extent); if (proved_gt.value_or(false)) { ir::Expr upper_bound = analyzer.UpperBound(extent); if (upper_bound.is_constant()) { - config->tile_config.warp_num = upper_bound.get_constant(); + return (static_cast(upper_bound.get_constant()) + 31) / 32; } } + return config.tile_config.warp_num; }; auto loops = sch->GetLoops(block_id); @@ -248,9 +250,9 @@ void TileFirstGeneralTactic::SplitWarpNumber(ir::IRSchedule* sch, } } else if (IsWarpReduce(context_->config)) { // get num warp from flatten num - LimitWarpNum(loops[0], &(context_->config)); - int thread_y = context_->config.tile_config.warp_num * 32 / - context_->config.tile_config.tree_reduce_num; + int minimal_warp_number = GetMinimalWarpNum(loops[0], context_->config); + int thread_y = + minimal_warp_number * 32 / 
context_->config.tile_config.tree_reduce_num;
sch->Split(loops[0], std::vector<int>({-1, thread_y}));
if (IsReduceBlock(context_->config, block_id) &&
diff --git a/paddle/cinn/ir/module.cc b/paddle/cinn/ir/module.cc
index 20298e32920fb..96c0187cbd9ce 100644
--- a/paddle/cinn/ir/module.cc
+++ b/paddle/cinn/ir/module.cc
@@ -35,6 +35,25 @@ void Module::Builder::AddFunctionWithoutOptim(const ir::LoweredFunc &func) {
module_->functions.push_back(func);
}
+std::optional<int> GetDataAlignmentImpl(common::UnknownArch arch) {
+ return std::nullopt;
+}
+
+std::optional<int> GetDataAlignmentImpl(common::X86Arch arch) { return 32; }
+
+std::optional<int> GetDataAlignmentImpl(common::ARMArch arch) {
+ return std::nullopt;
+}
+
+std::optional<int> GetDataAlignmentImpl(common::NVGPUArch arch) {
+ return std::nullopt;
+}
+
+std::optional<int> GetDataAlignment(common::Arch arch) {
+ return std::visit([](const auto &impl) { return GetDataAlignmentImpl(impl); },
+ arch.variant());
+}
+
void Module::Builder::AddBuffer(ir::Buffer buffer) {
CHECK(buffer->target.defined())
<< "buffer [" << buffer->name << "]'s target is undefined";
@@ -43,8 +62,8 @@ void Module::Builder::AddBuffer(ir::Buffer buffer) {
return x.as_buffer()->name == buffer->name;
}) == std::end(module_->buffers)) {
module_->buffers.push_back(buffer);
- if (module_->target.arch == Target::Arch::X86) {
- module_->buffers.back().as_buffer()->data_alignment = 32;
+ if (auto alignment = GetDataAlignment(module_->target.arch)) {
+ module_->buffers.back().as_buffer()->data_alignment = alignment.value();
}
}
}
@@ -64,7 +83,7 @@ void Module::Builder::Clear() {
module_->predicates.clear();
}
-Target::Arch Module::Builder::GetTargetArch() { return module_->target.arch; }
+common::Arch Module::Builder::GetTargetArch() { return module_->target.arch; }
Module Module::Builder::Build() {
if (module_->functions.empty()) {
diff --git a/paddle/cinn/ir/module.h b/paddle/cinn/ir/module.h
index 160d0087a0e54..438c0e6db30d5 100644
--- a/paddle/cinn/ir/module.h
+++ b/paddle/cinn/ir/module.h
@@ -47,7 +47,7 @@ class Module : public ir::IrNodeRef {
void AddPredicate(ir::Expr predicate);
void SetInferShapeFunc(ir::Expr infer_shape_func);
void Clear();
- Target::Arch GetTargetArch();
+ common::Arch GetTargetArch();
Module Build();
diff --git a/paddle/cinn/ir/op/ir_operators.cc b/paddle/cinn/ir/op/ir_operators.cc
index d11a26685851f..6b68e3ce60c40 100644
--- a/paddle/cinn/ir/op/ir_operators.cc
+++ b/paddle/cinn/ir/op/ir_operators.cc
@@ -69,6 +69,40 @@ Expr operator>>(Expr a, Expr b) {
return lang::CallExtern("right_shift", {a, b}, {{"vectorizable", false}});
}
+Expr BitwiseOrCallImpl(common::UnknownArch,
+ const Target& target,
+ Expr a,
+ Expr b) {
+ std::stringstream ss;
+ ss << "Unsupported arch: " << target.arch_str() << " for bitwise_or.";
+ PADDLE_THROW(phi::errors::InvalidArgument(ss.str()));
+}
+
+Expr BitwiseOrCallImpl(common::X86Arch, const Target& target, Expr a, Expr b) {
+ return lang::CallExtern("bitwise_or", {a, b}, {{"vectorizable", false}});
+}
+
+Expr BitwiseOrCallImpl(common::ARMArch, const Target& target, Expr a, Expr b) {
+ std::stringstream ss;
+ ss << "Unsupported arch: " << target.arch_str() << " for bitwise_or.";
+ PADDLE_THROW(phi::errors::InvalidArgument(ss.str()));
+}
+
+Expr BitwiseOrCallImpl(common::NVGPUArch,
+ const Target& target,
+ Expr a,
+ Expr b) {
+ Type t_a = a.type();
+ auto func_name = hlir::GetExternFuncName(target, t_a, "bitwise_or");
+ return lang::CallExtern(func_name, {a, b}, {{"vectorizable", false}});
+}
+
+Expr BitwiseOrCall(const Target& target, Expr a, Expr b)
{
+ return std::visit(
+ [&](const auto& arch) { return BitwiseOrCallImpl(arch, target, a, b); },
+ target.arch.variant());
+}
+
Expr operator|(Expr a, Expr b) {
CHECK(a.type().is_int() || a.type().is_uint());
CHECK(b.type().is_int() || b.type().is_uint());
@@ -82,16 +116,41 @@ Expr operator|(Expr a, Expr b) {
}
}
auto target = cinn::runtime::CurrentTarget::GetCurrentTarget();
- if (target.arch == cinn::common::Target::Arch::X86) {
- return lang::CallExtern("bitwise_or", {a, b}, {{"vectorizable", false}});
- } else if (target.arch == cinn::common::Target::Arch::NVGPU) {
- auto func_name = hlir::GetExternFuncName(target, t_a, "bitwise_or");
- return lang::CallExtern(func_name, {a, b}, {{"vectorizable", false}});
- } else {
- std::stringstream ss;
- ss << "Unsupport arch: " << target.arch_str() << " for bitwise_or.";
- PADDLE_THROW(phi::errors::InvalidArgument(ss.str()));
- }
+ return BitwiseOrCall(target, a, b);
+}
+
+Expr BitwiseAndCallImpl(common::UnknownArch,
+ const Target& target,
+ Expr a,
+ Expr b) {
+ std::stringstream ss;
+ ss << "Unsupported arch: " << target.arch_str() << " for bitwise_and.";
+ PADDLE_THROW(phi::errors::InvalidArgument(ss.str()));
+}
+
+Expr BitwiseAndCallImpl(common::X86Arch, const Target& target, Expr a, Expr b) {
+ return lang::CallExtern("bitwise_and", {a, b}, {{"vectorizable", false}});
+}
+
+Expr BitwiseAndCallImpl(common::ARMArch, const Target& target, Expr a, Expr b) {
+ std::stringstream ss;
+ ss << "Unsupported arch: " << target.arch_str() << " for bitwise_and.";
+ PADDLE_THROW(phi::errors::InvalidArgument(ss.str()));
+}
+
+Expr BitwiseAndCallImpl(common::NVGPUArch,
+ const Target& target,
+ Expr a,
+ Expr b) {
+ Type t_a = a.type();
+ auto func_name = hlir::GetExternFuncName(target, t_a, "bitwise_and");
+ return lang::CallExtern(func_name, {a, b}, {{"vectorizable", false}});
+}
+
+Expr BitwiseAndCall(const Target& target, Expr a, Expr b) {
+ return std::visit(
+ [&](const auto& arch) { return BitwiseAndCallImpl(arch, target, a, b); },
+ target.arch.variant());
}
Expr operator&(Expr a, Expr b) {
@@ -107,16 +166,41 @@ Expr operator&(Expr a, Expr b) {
}
}
auto target = cinn::runtime::CurrentTarget::GetCurrentTarget();
- if (target.arch == cinn::common::Target::Arch::X86) {
- return lang::CallExtern("bitwise_and", {a, b}, {{"vectorizable", false}});
- } else if (target.arch == cinn::common::Target::Arch::NVGPU) {
- auto func_name = hlir::GetExternFuncName(target, t_a, "bitwise_and");
- return lang::CallExtern(func_name, {a, b}, {{"vectorizable", false}});
- } else {
- std::stringstream ss;
- ss << "Unsupport arch: " << target.arch_str() << " for bitwise_and.";
- PADDLE_THROW(phi::errors::InvalidArgument(ss.str()));
- }
+ return BitwiseAndCall(target, a, b);
+}
+
+Expr BitwiseXorCallImpl(common::UnknownArch,
+ const Target& target,
+ Expr a,
+ Expr b) {
+ std::stringstream ss;
+ ss << "Unsupported arch: " << target.arch_str() << " for bitwise_xor.";
+ PADDLE_THROW(phi::errors::InvalidArgument(ss.str()));
+}
+
+Expr BitwiseXorCallImpl(common::X86Arch, const Target& target, Expr a, Expr b) {
+ return lang::CallExtern("bitwise_xor", {a, b}, {{"vectorizable", false}});
+}
+
+Expr BitwiseXorCallImpl(common::ARMArch, const Target& target, Expr a, Expr b) {
+ std::stringstream ss;
+ ss << "Unsupported arch: " << target.arch_str() << " for bitwise_xor.";
+ PADDLE_THROW(phi::errors::InvalidArgument(ss.str()));
+}
+
+Expr BitwiseXorCallImpl(common::NVGPUArch,
+ const Target& target,
+ Expr a,
+ Expr b) {
+ Type t_a = a.type();
+ auto func_name =
hlir::GetExternFuncName(target, t_a, "bitwise_xor");
+ return lang::CallExtern(func_name, {a, b}, {{"vectorizable", false}});
+}
+
+Expr BitwiseXorCall(const Target& target, Expr a, Expr b) {
+ return std::visit(
+ [&](const auto& arch) { return BitwiseXorCallImpl(arch, target, a, b); },
+ target.arch.variant());
}
Expr operator^(Expr a, Expr b) {
@@ -132,31 +216,40 @@ Expr operator^(Expr a, Expr b) {
}
}
auto target = cinn::runtime::CurrentTarget::GetCurrentTarget();
- if (target.arch == cinn::common::Target::Arch::X86) {
- return lang::CallExtern("bitwise_xor", {a, b}, {{"vectorizable", false}});
- } else if (target.arch == cinn::common::Target::Arch::NVGPU) {
- auto func_name = hlir::GetExternFuncName(target, t_a, "bitwise_xor");
- return lang::CallExtern(func_name, {a, b}, {{"vectorizable", false}});
- } else {
- std::stringstream ss;
- ss << "Unsupport arch: " << target.arch_str() << " for bitwise_xor.";
- PADDLE_THROW(phi::errors::InvalidArgument(ss.str()));
- }
+ return BitwiseXorCall(target, a, b);
+}
+
+Expr BitwiseNotCallImpl(common::UnknownArch, const Target& target, Expr a) {
+ std::stringstream ss;
+ ss << "Unsupported arch: " << target.arch_str() << " for bitwise_not.";
+ PADDLE_THROW(phi::errors::InvalidArgument(ss.str()));
+}
+
+Expr BitwiseNotCallImpl(common::X86Arch, const Target& target, Expr a) {
+ return lang::CallExtern("bitwise_not", {a}, {{"vectorizable", false}});
+}
+
+Expr BitwiseNotCallImpl(common::ARMArch, const Target& target, Expr a) {
+ std::stringstream ss;
+ ss << "Unsupported arch: " << target.arch_str() << " for bitwise_not.";
+ PADDLE_THROW(phi::errors::InvalidArgument(ss.str()));
+}
+
+Expr BitwiseNotCallImpl(common::NVGPUArch, const Target& target, Expr a) {
+ auto func_name = hlir::GetExternFuncName(target, a->type(), "bitwise_not");
+ return lang::CallExtern(func_name, {a}, {{"vectorizable", false}});
+}
+
+Expr BitwiseNotCall(const Target& target, Expr a) {
+ return std::visit(
+ [&](const auto& arch) { return BitwiseNotCallImpl(arch, target, a); },
+ target.arch.variant());
}
Expr operator~(Expr a) {
CHECK(a.type().is_int() || a.type().is_uint());
auto target = cinn::runtime::CurrentTarget::GetCurrentTarget();
- if (target.arch == cinn::common::Target::Arch::X86) {
- return lang::CallExtern("bitwise_not", {a}, {{"vectorizable", false}});
- } else if (target.arch == cinn::common::Target::Arch::NVGPU) {
- auto func_name = hlir::GetExternFuncName(target, a->type(), "bitwise_not");
- return lang::CallExtern(func_name, {a}, {{"vectorizable", false}});
- } else {
- std::stringstream ss;
- ss << "Unsupport arch: " << target.arch_str() << " for bitwise_not.";
- PADDLE_THROW(phi::errors::InvalidArgument(ss.str()));
- }
+ return BitwiseNotCall(target, a);
}
} // namespace ir
diff --git a/paddle/cinn/ir/schedule/impl/for_type.cc b/paddle/cinn/ir/schedule/impl/for_type.cc
index a53870f09ea46..84d45d6827d3d 100644
--- a/paddle/cinn/ir/schedule/impl/for_type.cc
+++ b/paddle/cinn/ir/schedule/impl/for_type.cc
@@ -123,8 +123,7 @@ void DyScheduleImpl::Bind(const Expr& loop, const std::string& thread_axis) {
throw IRScheduleErrorHandler(primitive, os.str(), module_expr_);
}
int offset = thread_axis.back() - 'x';
- auto cur_dev_info =
- common::DevInfoMgr<common::Target::Arch::NVGPU>::GetDevInfo(0);
+ auto cur_dev_info = common::DevInfoMgr<common::NVGPUArch>::GetDevInfo(0);
const std::array<int, 3> kMaxBlockDims = cur_dev_info->GetMaxBlockDims();
const std::array<int, 3> kMaxGridDims = cur_dev_info->GetMaxGridDims();
auto check_offset = [&](const char& c) -> bool {
@@ -202,8 +201,7 @@ void StScheduleImpl::Bind(const Expr& loop, const
std::string& thread_axis) {
<< "thread_axis " << thread_axis << " is not supported";
int offset = thread_axis.back() - 'x';
auto cur_dev_info =
- cinn::common::DevInfoMgr<common::Target::Arch::NVGPU>::GetDevInfo(
- 0);
+ cinn::common::DevInfoMgr<common::NVGPUArch>::GetDevInfo(0);
const std::array<int, 3> kMaxBlockDims = cur_dev_info->GetMaxBlockDims();
const std::array<int, 3> kMaxGridDims = cur_dev_info->GetMaxGridDims();
auto check_offset = [&](const char& c) -> bool {
diff --git a/paddle/cinn/ir/schedule/ir_schedule.h b/paddle/cinn/ir/schedule/ir_schedule.h
index cab1b0d38d868..7927efdaa277f 100644
--- a/paddle/cinn/ir/schedule/ir_schedule.h
+++ b/paddle/cinn/ir/schedule/ir_schedule.h
@@ -32,11 +32,11 @@ namespace cinn {
namespace ir {
/**
- * A struct containing all the schedule primitives. Each shedule primitive is a
- * member function of IRSchedule. Schedule primitves are implmented by
+ * A struct containing all the schedule primitives. Each schedule primitive is a
+ * member function of IRSchedule. Schedule primitives are implemented by
* StScheduleImpl manipulating the AST - IR(Expr). To support serializing and
* replaying, each schedule primitive should append a ScheduleDesc::Step to the
- * trace_ in its corresponding function implment.
+ * trace_ in its corresponding function implementation.
*/
class IRSchedule {
public:
@@ -353,7 +353,7 @@ class IRSchedule {
* If the rfactor loop is k and rf_axis is 0, the rfactor transformation is
* divided into 2 steps:
* 1. get the rfactor block where the reduce loop k is transformed to the
- * serial loop with no accumalation and a new rfactor tensor is created. The
+ * serial loop with no accumulation and a new rfactor tensor is created. The
* axis k will be placed in the rf_axis of the new rf_tensor. The rf_block is
* as follows: \code for (rf_k, 0, 30) // rfactor loop k is transformed
* to the serial loop. for (i, 0, 10) // serial loop for (j, 0, 20) //
@@ -390,7 +390,7 @@ class IRSchedule {
* If the rf loop is j and rf_axis is 0, the transformation is
* divided into 2 steps:
* 1. get the rf block where the reduce loop j is transformed to the
- * serial loop with no accumalation and a new rf tensor is created.
+ * serial loop with no accumulation and a new rf tensor is created.
* The axis j will be placed in the rf_axis of the new rf_tensor.
* The rf_block is as follows:
* \code
@@ -457,7 +457,7 @@ class IRSchedule {
/*!
* \brief Insert a tag in schedule_desc to mark the beginning of post
- * processing, the schedue primitive itself does not make any changes to the
+ * processing, the schedule primitive itself does not make any changes to the
* IR.
*/
void TagPostSchedule();
@@ -491,7 +491,7 @@ class IRSchedule {
/*!
* \brief The base class of the inliner, which handles: * 1) Remove the block to be lined - * 2) Maintain a list of index variables and their substition of the buffer + * 2) Maintain a list of index variables and their substitution of the buffer * being inlined */ class BaseInliner : public ir::IRMutator<> { diff --git a/paddle/cinn/ir/schedule/schedule_base.cc b/paddle/cinn/ir/schedule/schedule_base.cc index b34221d73f052..885391aecd073 100644 --- a/paddle/cinn/ir/schedule/schedule_base.cc +++ b/paddle/cinn/ir/schedule/schedule_base.cc @@ -105,7 +105,7 @@ void ScheduleBase::Broadcast(const std::string& block_name, } std::vector all_loops = this->GetLoops(block_name); if (axes[0] >= all_loops.size()) { - throw std::runtime_error("axes execeed loop size"); + throw std::runtime_error("axes exceed loop size"); } // Get Last loop @@ -150,14 +150,14 @@ void ScheduleBase::Broadcast(const std::string& block_name, auto stride = Expr(1); auto in_offset = Expr(0); - std::set brodacast_set(info.broadcast_axes.begin(), + std::set broadcast_set(info.broadcast_axes.begin(), info.broadcast_axes.end()); for (int i = all_loops.size() - 1; i >= 0; --i) { auto loop_temp = all_loops[i].As(); offset = offset + loop_temp->loop_var * stride; stride = stride * loop_temp->extent; - if (!brodacast_set.count(i)) { + if (!broadcast_set.count(i)) { in_offset = in_offset + loop_temp->loop_var * stride; } } diff --git a/paddle/cinn/ir/test/buffer_test.cc b/paddle/cinn/ir/test/buffer_test.cc index 9dd4c489c999d..b8d4d247b30a9 100644 --- a/paddle/cinn/ir/test/buffer_test.cc +++ b/paddle/cinn/ir/test/buffer_test.cc @@ -69,7 +69,7 @@ TEST(Buffer, bind_to_multiple_tensors) { auto funcs = lang::Lower("func1", stages, {A, B}); Target target; - target.arch = Target::Arch ::X86; + target.arch = common::X86Arch{}; target.bits = Target::Bit ::k32; target.os = Target::OS ::Linux; diff --git a/paddle/cinn/operator_fusion/CMakeLists.txt b/paddle/cinn/operator_fusion/CMakeLists.txt new file mode 100644 index 0000000000000..d5a46bd4ef2e2 --- /dev/null +++ b/paddle/cinn/operator_fusion/CMakeLists.txt @@ -0,0 +1,9 @@ +add_subdirectory(frontend) +add_subdirectory(backend) +add_subdirectory(policy) + +cc_library( + op_fusion + SRCS ${frontend_fusion_src} ${backend_fusion_src} ${policy_fusion_src} + pattern_graph.cc + DEPS phi) diff --git a/paddle/cinn/operator_fusion/backend/CMakeLists.txt b/paddle/cinn/operator_fusion/backend/CMakeLists.txt new file mode 100644 index 0000000000000..fe18d42b10fec --- /dev/null +++ b/paddle/cinn/operator_fusion/backend/CMakeLists.txt @@ -0,0 +1 @@ +gather_srcs(backend_fusion_src SRCS pattern_fuser.cc) diff --git a/paddle/cinn/operator_fusion/backend/pattern.h b/paddle/cinn/operator_fusion/backend/pattern.h new file mode 100644 index 0000000000000..7006fb02b829e --- /dev/null +++ b/paddle/cinn/operator_fusion/backend/pattern.h @@ -0,0 +1,88 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "paddle/cinn/hlir/framework/pir/trivial_op_impl.h" +#include "paddle/cinn/ir/ir_base.h" +#include "paddle/cinn/operator_fusion/pattern.h" +#include "paddle/cinn/operator_fusion/pattern_fuser.h" +#include "paddle/cinn/operator_fusion/utils.h" + +namespace cinn::fusion { + +struct BackendStage {}; + +template <> +struct PatternContent { + explicit PatternContent(pir::Operation* op, + std::optional expr) + : op(op), expr(expr) {} + pir::Operation* op; + std::optional expr; +}; + +using BackendContent = PatternContent; +using TrivialOp = cinn::hlir::framework::pir::trivial_fusion_detail::TrivialOp; +using ReduceOp = cinn::hlir::framework::pir::trivial_fusion_detail::ReduceOp; +using FusionOp = std::variant; +template <> +struct TrivialPattern { + explicit TrivialPattern(const std::vector& ops, + const TrivialOp& op) + : ops_(ops), trivial_op(op) {} + std::vector ops_; + TrivialOp trivial_op; + static std::string name() { return "Trivial"; } + std::vector ops() const { return ops_; } +}; + +template <> +struct ReducePattern { + explicit ReducePattern(const std::vector& ops, + const ReduceOp& op) + : ops_(ops), reduce_op(op) {} + std::vector ops_; + ReduceOp reduce_op; + std::vector ops() const { return ops_; } + pir::Operation* GetReduceOp() const { return ops_.back(); } + static std::string name() { return "Reduce"; } +}; + +template <> +struct UnsupportPattern { + explicit UnsupportPattern(const std::vector& ops) + : ops_(ops) {} + std::vector ops_; + std::vector ops() const { return ops_; } + static std::string name() { return "Unsupport"; } +}; + +template <> +struct HorizontalFusionPattern { + explicit HorizontalFusionPattern( + const std::vector>& patterns) + : patterns_(patterns) {} + std::vector> patterns_; + std::vector ops() const { + std::vector result; + for (const auto& pattern : patterns_) { + auto ops = GetOpsInPattern(pattern); + ExtendVector(&result, ops); + } + return result; + } + static std::string name() { return "HorizontalFusionPattern"; } +}; + +} // namespace cinn::fusion diff --git a/paddle/cinn/operator_fusion/backend/pattern_fuser.cc b/paddle/cinn/operator_fusion/backend/pattern_fuser.cc new file mode 100644 index 0000000000000..61e8fd658f94a --- /dev/null +++ b/paddle/cinn/operator_fusion/backend/pattern_fuser.cc @@ -0,0 +1,211 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "glog/logging.h" + +#include "paddle/cinn/hlir/framework/pir/trivial_op_impl.h" +#include "paddle/cinn/operator_fusion/backend/pattern.h" +#include "paddle/cinn/operator_fusion/backend/pattern_fuser.h" + +namespace cinn::fusion { + +template <> +StmtPattern ConvertToStmtPattern( + const PatternContent& content) { + const auto& kind = GetOpPatternKind(content.op); + if (kind == hlir::framework::kReduction) { + CHECK(content.expr.has_value()); + return ReducePattern({content.op}, + ReduceOp(content.expr.value())); + } else if (kind == hlir::framework::kElementWise || + kind == hlir::framework::kBroadcast || + kind == hlir::framework::kInjective) { + CHECK(content.expr.has_value()); + return TrivialPattern({content.op}, + TrivialOp(content.expr.value())); + } else { + CHECK(false); + return UnsupportPattern({content.op}); + } +} + +// template StmtPattern RT_x_RT(const +// ReduceTreePattern& upstream, const +// ReduceTreePattern& downstream); + +template <> +StmtPattern MergePatternImpl( + const ReduceTreePattern& first, + const TrivialPattern& second) { + return ReduceTreePlusTrivialPattern(first, second); +} + +template <> +StmtPattern MergePatternImpl( + const TrivialPattern& first, + const ReducePattern& second) { + const auto& ops = UniqueConcatVector(GetOpsInPattern(first), + GetOpsInPattern(second)); + const auto& reduce_op = + cinn::hlir::framework::pir::trivial_fusion_detail::TrivalxOther_Fusion( + first.trivial_op, second.reduce_op); + return ReducePattern(ops, reduce_op); +} + +template <> +StmtPattern MergePatternImpl( + const TrivialPattern& first, + const TrivialPattern& second) { + const auto& ops = UniqueConcatVector(GetOpsInPattern(first), + GetOpsInPattern(second)); + const auto& trivial_op = + cinn::hlir::framework::pir::trivial_fusion_detail::TrivalxOther_Fusion( + first.trivial_op, second.trivial_op); + return TrivialPattern(ops, trivial_op); +} + +template <> +StmtPattern MergePatternImpl( + const HorizontalFusionPattern& first, + const HorizontalFusionPattern& second) { + const auto& contents = + UniqueConcatVector(GetOpsInPattern(first), + GetOpsInPattern(second)); + return HorizontalFusionPattern({first, second}); +} + +/// Start: Tmp Transform Operation for ReduceTree +std::vector ReduceTransformRecursive( + ReduceOp reduce_op, + const ReduceTreePattern& reduce_tree_pattern, + const std::vector& fake_reduce_iter_idx = {}) { + FusionOp root_op = reduce_op; + VLOG(4) << "ReduceTransformRecursive: " << *_GetFuncBodyPointer(root_op); + std::vector result; + for (const auto& child_tree : reduce_tree_pattern.childs()) { + const auto& child_reduce_op = child_tree.GetRootPattern().reduce_op; + auto transformed_nodes = cinn::hlir::framework::pir::trivial_fusion_detail:: + TransformReduceLoopRange( + child_reduce_op, &root_op, fake_reduce_iter_idx); + for (auto& node : transformed_nodes) { + auto child_flatten = + ReduceTransformRecursive(std::get(node), child_tree); + result.insert(result.end(), child_flatten.begin(), child_flatten.end()); + } + } + result.push_back(root_op); + VLOG(4) << "ReduceTransformRecursive: End"; + return result; +} + +std::vector ReduceTreeTrivialTransformRecursive( + TrivialOp trivial_op, + const ReduceTreePlusTrivialPattern& rt_pattern) { + FusionOp root_op = trivial_op; + VLOG(4) << "ReduceTrivialTransformRecursive: " + << *_GetFuncBodyPointer(root_op); + std::vector result; + // for (const auto& child_tree : ) { + // + const auto& child_tree = rt_pattern.tree; + const auto& child_reduce_op = 
child_tree.GetRootPattern().reduce_op; + auto transformed_nodes = cinn::hlir::framework::pir::trivial_fusion_detail:: + TransformReduceLoopRange( + child_reduce_op, &root_op, rt_pattern.fake_reduce_iter_idx); + for (auto& node : transformed_nodes) { + auto child_flatten = ReduceTransformRecursive( + std::get(node), child_tree, rt_pattern.fake_reduce_iter_idx); + result.insert(result.end(), child_flatten.begin(), child_flatten.end()); + } + //} + result.push_back( + cinn::hlir::framework::pir::trivial_fusion_detail::SinkTrivialLoopAlign( + std::get(root_op), + rt_pattern.tree.GetRootPattern().reduce_op, + rt_pattern.fake_reduce_iter_idx)); + VLOG(4) << "ReduceTrivialTransformRecursive End;"; + return result; +} + +/// End: Tmp Transform Operation for reduce tree + +std::vector GetFusionOpFromPattern( + const StmtPattern& pattern); + +struct FusionOpGetter { + std::vector operator()( + const TrivialPattern& pattern) { + return {pattern.trivial_op}; + } + + std::vector operator()(const ReducePattern& pattern) { + return {pattern.reduce_op}; + } + + std::vector operator()( + const ReduceTreePattern& pattern) { + return ReduceTransformRecursive(pattern.GetRootPattern().reduce_op, + pattern); + } + + std::vector operator()( + const ReduceTreePlusTrivialPattern& pattern) { + return ReduceTreeTrivialTransformRecursive(pattern.sink_trivial.trivial_op, + pattern); + } + + std::vector operator()( + const HorizontalFusionPattern& pattern) { + std::vector result; + for (const auto& sub_pattern : pattern.patterns_) { + result = ConcatVector(result, GetFusionOpFromPattern(sub_pattern)); + } + return result; + } + + std::vector operator()( + const UnsupportPattern& pattern) { + CHECK(false) << "Not Implemented."; + } +}; + +// tmp transform for reduce_tree and reduce_tree_trivial. +std::vector GetFusionOpFromPattern( + const StmtPattern& pattern) { + return std::visit(FusionOpGetter(), pattern.variant()); +} + +struct FusionOp2Expr { + std::vector operator()(const TrivialOp& op) { + return {op.GetFuncBody()}; + } + std::vector operator()(const ReduceOp& op) { + const auto& t_r = SplitReduceOp(op); + return {t_r.first.GetFuncBody(), t_r.second.GetFuncBody()}; + } +}; + +std::vector GetExprFromPattern( + const StmtPattern& pattern) { + const auto& fusion_ops = GetFusionOpFromPattern(pattern); + std::vector results; + for (const auto& op : fusion_ops) { + results = ConcatVector(results, std::visit(FusionOp2Expr(), op)); + } + return results; +} + +} // namespace cinn::fusion diff --git a/paddle/cinn/operator_fusion/backend/pattern_fuser.h b/paddle/cinn/operator_fusion/backend/pattern_fuser.h new file mode 100644 index 0000000000000..a460d67f8e02f --- /dev/null +++ b/paddle/cinn/operator_fusion/backend/pattern_fuser.h @@ -0,0 +1,52 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
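// Illustrative sketch (hypothetical caller, not from this patch): the
// declarations below are the backend half of the fusion protocol; after
// clustering, GetExprFromPattern turns a fused pattern back into
// schedulable bodies.
//
//   std::vector<cinn::ir::Expr> LowerFusedPattern(
//       const cinn::fusion::StmtPattern<cinn::fusion::BackendStage>& p) {
//     // Runs the ReduceTree transforms and flattens every FusionOp into
//     // its function body (SplitReduceOp splits a ReduceOp in two).
//     return cinn::fusion::GetExprFromPattern(p);
//   }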
+
+#pragma once
+
+#include "paddle/cinn/operator_fusion/backend/pattern.h"
+#include "paddle/cinn/operator_fusion/pattern.h"
+#include "paddle/cinn/operator_fusion/pattern_fuser.h"
+
+namespace cinn::fusion {
+
+// Extern-style declarations: declare the explicit specializations here so
+// that the compiler does not implicitly instantiate the generic templates
+// for BackendStage; the definitions live in backend/pattern_fuser.cc.
+
+template <>
+StmtPattern<BackendStage> ConvertToStmtPattern(
+    const PatternContent<BackendStage>& content);
+
+template <>
+StmtPattern<BackendStage> MergePatternImpl(
+    const ReduceTreePattern<BackendStage>& first,
+    const TrivialPattern<BackendStage>& second);
+
+template <>
+StmtPattern<BackendStage> MergePatternImpl(
+    const TrivialPattern<BackendStage>& first,
+    const ReducePattern<BackendStage>& second);
+
+template <>
+StmtPattern<BackendStage> MergePatternImpl(
+    const TrivialPattern<BackendStage>& first,
+    const TrivialPattern<BackendStage>& second);
+
+template <>
+StmtPattern<BackendStage> MergePatternImpl(
+    const HorizontalFusionPattern<BackendStage>& first,
+    const HorizontalFusionPattern<BackendStage>& second);
+
+std::vector<ir::Expr> GetExprFromPattern(
+    const StmtPattern<BackendStage>& pattern);
+
+}  // namespace cinn::fusion
diff --git a/paddle/cinn/operator_fusion/frontend/CMakeLists.txt b/paddle/cinn/operator_fusion/frontend/CMakeLists.txt
new file mode 100644
index 0000000000000..e9683e520ef5a
--- /dev/null
+++ b/paddle/cinn/operator_fusion/frontend/CMakeLists.txt
@@ -0,0 +1 @@
+gather_srcs(frontend_fusion_src SRCS pattern_fuser.cc)
diff --git a/paddle/cinn/operator_fusion/frontend/pattern.h b/paddle/cinn/operator_fusion/frontend/pattern.h
new file mode 100644
index 0000000000000..e267483e56586
--- /dev/null
+++ b/paddle/cinn/operator_fusion/frontend/pattern.h
@@ -0,0 +1,92 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
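// Illustrative sketch (hypothetical helper): the FrontendStage content
// defined below wraps only a pir::Operation*, and the std::hash
// specialization below makes it usable in unordered containers before any
// lowering has happened.
//
//   std::unordered_set<cinn::fusion::FrontendContent> seen;
//   bool RecordOnce(pir::Operation* op) {
//     return seen.insert(cinn::fusion::FrontendContent(op)).second;
//   }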
+ +#pragma once +#include "paddle/cinn/operator_fusion/pattern.h" +#include "paddle/cinn/operator_fusion/pattern_fuser.h" +#include "paddle/cinn/operator_fusion/utils.h" + +namespace cinn::fusion { + +struct FrontendStage {}; + +template <> +struct PatternContent { + explicit PatternContent(pir::Operation* op) : op(op) {} + pir::Operation* op; + bool operator==(const PatternContent& other) const { + return op == other.op; + } +}; + +using FrontendContent = PatternContent; + +} // namespace cinn::fusion + +namespace std { +template <> +struct hash { + size_t operator()(const cinn::fusion::FrontendContent& content) const { + return std::hash()(content.op); + } +}; + +} // namespace std + +namespace cinn::fusion { +template <> +struct TrivialPattern { + explicit TrivialPattern(const std::vector& ops) + : ops_(ops) {} + std::vector ops_; + static std::string name() { return "Trivial"; } + std::vector ops() const { return ops_; } +}; + +template <> +struct ReducePattern { + explicit ReducePattern(const std::vector& ops) : ops_(ops) {} + std::vector ops_; + std::vector ops() const { return ops_; } + pir::Operation* GetReduceOp() const { return ops_.back(); } + static std::string name() { return "Reduce"; } +}; + +template <> +struct UnsupportPattern { + explicit UnsupportPattern(const std::vector& ops) + : ops_(ops) {} + std::vector ops_; + std::vector ops() const { return ops_; } + static std::string name() { return "Unsupport"; } +}; + +template <> +struct HorizontalFusionPattern { + explicit HorizontalFusionPattern( + const std::vector>& patterns) + : patterns_(patterns) {} + std::vector> patterns_; + std::vector ops() const { + std::vector result; + for (const auto& pattern : patterns_) { + auto ops = GetOpsInPattern(pattern); + ExtendVector(&result, ops); + } + return result; + } + static std::string name() { return "HorizontalFusionPattern"; } +}; + +} // namespace cinn::fusion diff --git a/paddle/cinn/operator_fusion/frontend/pattern_fuser.cc b/paddle/cinn/operator_fusion/frontend/pattern_fuser.cc new file mode 100644 index 0000000000000..332230f409979 --- /dev/null +++ b/paddle/cinn/operator_fusion/frontend/pattern_fuser.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
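// Illustrative sketch (hypothetical helper, not from this patch): the
// specializations implemented in this file classify ops purely by their
// OpPatternKind, with no ir::Expr attached.
//
//   std::vector<cinn::fusion::StmtPattern<cinn::fusion::FrontendStage>>
//   ClassifyAll(const std::vector<pir::Operation*>& ops) {
//     std::vector<cinn::fusion::StmtPattern<cinn::fusion::FrontendStage>> out;
//     for (auto* op : ops) {
//       out.push_back(cinn::fusion::ConvertToStmtPattern(
//           cinn::fusion::FrontendContent(op)));
//     }
//     return out;
//   }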
+
+#pragma once
+
+#include "paddle/cinn/operator_fusion/frontend/pattern_fuser.h"
+#include "paddle/cinn/operator_fusion/frontend/pattern.h"
+
+namespace cinn::fusion {
+
+template <>
+StmtPattern<FrontendStage> ConvertToStmtPattern(
+    const PatternContent<FrontendStage>& content) {
+  const auto& kind = GetOpPatternKind(content.op);
+  if (kind == hlir::framework::kReduction) {
+    return ReducePattern<FrontendStage>({content.op});
+  } else if (kind == hlir::framework::kElementWise ||
+             kind == hlir::framework::kBroadcast ||
+             kind == hlir::framework::kInjective) {
+    return TrivialPattern<FrontendStage>({content.op});
+  } else {
+    return UnsupportPattern<FrontendStage>({content.op});
+  }
+}
+
+template <>
+StmtPattern<FrontendStage> MergePatternImpl(
+    const ReduceTreePattern<FrontendStage>& first,
+    const TrivialPattern<FrontendStage>& second) {
+  return ReduceTreePlusTrivialPattern<FrontendStage>(first, second);
+}
+
+template <>
+StmtPattern<FrontendStage> MergePatternImpl(
+    const TrivialPattern<FrontendStage>& first,
+    const ReducePattern<FrontendStage>& second) {
+  const auto& contents =
+      UniqueConcatVector(GetOpsInPattern<FrontendStage>(first),
+                         GetOpsInPattern<FrontendStage>(second));
+  return ReducePattern<FrontendStage>(contents);
+}
+
+template <>
+StmtPattern<FrontendStage> MergePatternImpl(
+    const TrivialPattern<FrontendStage>& first,
+    const TrivialPattern<FrontendStage>& second) {
+  const auto& contents =
+      UniqueConcatVector(GetOpsInPattern<FrontendStage>(first),
+                         GetOpsInPattern<FrontendStage>(second));
+  return TrivialPattern<FrontendStage>(contents);
+}
+
+template <>
+StmtPattern<FrontendStage> MergePatternImpl(
+    const HorizontalFusionPattern<FrontendStage>& first,
+    const HorizontalFusionPattern<FrontendStage>& second) {
+  const auto& contents =
+      UniqueConcatVector(GetOpsInPattern<FrontendStage>(first),
+                         GetOpsInPattern<FrontendStage>(second));
+  return HorizontalFusionPattern<FrontendStage>({first, second});
+}
+
+}  // namespace cinn::fusion
diff --git a/paddle/cinn/operator_fusion/frontend/pattern_fuser.h b/paddle/cinn/operator_fusion/frontend/pattern_fuser.h
new file mode 100644
index 0000000000000..d92b8429ad16b
--- /dev/null
+++ b/paddle/cinn/operator_fusion/frontend/pattern_fuser.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/cinn/operator_fusion/frontend/pattern.h"
+#include "paddle/cinn/operator_fusion/pattern.h"
+#include "paddle/cinn/operator_fusion/pattern_fuser.h"
+
+namespace cinn::fusion {
+
+// Extern-style declarations: declare the explicit specializations here so
+// that the compiler does not implicitly instantiate the generic templates
+// for FrontendStage; the definitions live in frontend/pattern_fuser.cc.
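// Rough two-stage flow implied by the frontend/backend split (an assumption
// of this sketch, not spelled out in the patch): cluster first on
// FrontendStage contents, lower the clustered ops, then rebuild patterns on
// BackendStage contents, e.g. via ClusterOps from group_cluster.h:
//
//   auto groups = cinn::fusion::ClusterOps(frontend_contents);
//   // ... lower each group, then re-run the fuser with BackendContent
//   // (op + ir::Expr) to drive the actual loop transformations.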
+
+template <>
+StmtPattern<FrontendStage> ConvertToStmtPattern(
+    const PatternContent<FrontendStage>& content);
+
+template <>
+StmtPattern<FrontendStage> MergePatternImpl(
+    const ReduceTreePattern<FrontendStage>& first,
+    const TrivialPattern<FrontendStage>& second);
+
+template <>
+StmtPattern<FrontendStage> MergePatternImpl(
+    const TrivialPattern<FrontendStage>& first,
+    const ReducePattern<FrontendStage>& second);
+
+template <>
+StmtPattern<FrontendStage> MergePatternImpl(
+    const TrivialPattern<FrontendStage>& first,
+    const TrivialPattern<FrontendStage>& second);
+
+template <>
+StmtPattern<FrontendStage> MergePatternImpl(
+    const HorizontalFusionPattern<FrontendStage>& first,
+    const HorizontalFusionPattern<FrontendStage>& second);
+
+}  // namespace cinn::fusion
diff --git a/paddle/cinn/operator_fusion/group_cluster.h b/paddle/cinn/operator_fusion/group_cluster.h
new file mode 100644
index 0000000000000..aa545699a0d4d
--- /dev/null
+++ b/paddle/cinn/operator_fusion/group_cluster.h
@@ -0,0 +1,91 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/cinn/operator_fusion/backend/pattern.h"
+#include "paddle/cinn/operator_fusion/backend/pattern_fuser.h"
+#include "paddle/cinn/operator_fusion/frontend/pattern.h"
+#include "paddle/cinn/operator_fusion/frontend/pattern_fuser.h"
+#include "paddle/cinn/operator_fusion/pattern_graph.h"
+#include "paddle/cinn/operator_fusion/policy/general_topo_policy.h"
+#include "paddle/cinn/operator_fusion/policy/relative_judge_policy.h"
+#include "paddle/cinn/operator_fusion/policy/shardable_axes_policy.h"
+
+namespace cinn::fusion {
+
+template <typename T>
+inline std::vector<fusion::PatternNodePtr<T>> ClusterOps(
+    const std::vector<fusion::PatternContent<T>>& contents) {
+  std::function<pir::Operation*(fusion::PatternContent<T>)> func =
+      [](const fusion::PatternContent<T>& content) { return content.op; };
+  const auto& origin_ops = fusion::MapVector(contents, func);
+  CHECK_GT(origin_ops.size(), 0);
+  VLOG(4) << "Start Cluster Ops!";
+  VLOG(4) << "Input Group with size " << origin_ops.size() << " :\n"
+          << fusion::OpsDebugStr(origin_ops);
+
+  std::vector<pir::Value> outputs;
+  const auto& ops = [&] {
+    std::vector<pir::Operation*> ops;
+    for (const auto& content : contents) {
+      if (content.op->name() == "cf.yield") {  // just skip cf.yield.
+        for (auto& operand : content.op->operands()) {
+          outputs.push_back(operand.source());
+        }
+        continue;
+      }
+      ops.emplace_back(content.op);
+    }
+    return ops;
+  }();
+
+  const auto& content_without_yield =
+      FilterVector(contents, [](const fusion::PatternContent<T>& content) {
+        return content.op->name() != "cf.yield";
+      });
+
+  pir::Program* program = ops.at(0)->GetParentProgram();
+
+  const auto* shape_analysis =
+      &pir::ShapeAnalysisManager::Instance().Get(program);
+
+  VLOG(4) << "Start Create Policies and PolicyManager!";
+  const auto& relative_judge_policy =
+      std::make_shared<fusion::RelativeJudgePolicy<T>>(ops, shape_analysis);
+
+  const auto& general_topo_policy =
+      std::make_shared<fusion::GeneralTopoPolicy<T>>();
+
+  auto policy_manager =
+      fusion::PolicyManager<T>({relative_judge_policy, general_topo_policy});
+
+  auto topo_manager = fusion::PolicyManager<T>({general_topo_policy});
+
+  VLOG(4) << "Start Create PatternGraph";
+  fusion::PatternGraph<T> graph(
+      content_without_yield, outputs, policy_manager, topo_manager);
+  auto result = graph.ClusterOps();
+
+  VLOG(4) << "End Cluster Ops! result size:" << result.size();
+  for (const auto& node : result) {
+    VLOG(4) << "\n"
+            << node->DebugStr() << "\n"
+            << fusion::StmtPatternDebugStr(node->stmt_pattern_);
+  }
+
+  return result;
+}
+
+}  // namespace cinn::fusion
diff --git a/paddle/cinn/operator_fusion/pattern.h b/paddle/cinn/operator_fusion/pattern.h
new file mode 100644
index 0000000000000..908b4a4348bfc
--- /dev/null
+++ b/paddle/cinn/operator_fusion/pattern.h
@@ -0,0 +1,101 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
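// Illustrative sketch (mirrors GetPatternName in pattern_fuser.h): every
// StmtPattern defined below is a variant over the per-stage pattern types,
// so generic queries are std::visit calls on variant():
//
//   template <typename T>
//   std::string NameOf(const cinn::fusion::StmtPattern<T>& s) {
//     return std::visit([](const auto& impl) { return impl.name(); },
//                       s.variant());
//   }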
+ +#pragma once + +#include +#include +#include +#include "glog/logging.h" +#include "paddle/cinn/operator_fusion/utils.h" +#include "paddle/pir/include/core/operation.h" + +namespace cinn::fusion { + +template +struct PatternContent {}; + +template +class TrivialPattern {}; + +template +class ReducePattern {}; + +template +struct ReduceTreePattern { + explicit ReduceTreePattern(const std::vector>& childs, + const ReducePattern& root) + : childs_(childs), root_(root) {} + const ReducePattern& GetRootPattern() const { return root_; } + std::vector ops() const { + std::vector result{root_.ops()}; + for (const auto& child : childs_) { + result = UniqueConcatVector(result, child.ops()); + } + return result; + } + static std::string name() { return "ReduceTree"; } + const std::vector>& childs() const { return childs_; } + std::vector>& childs() { return childs_; } + void InsertChild(const ReduceTreePattern& child) { + childs_.push_back(child); + } + std::vector> FlattenReducePattern() const { + std::vector> result; + for (const auto& child : childs_) { + result = ConcatVector(result, child.FlattenReducePattern()); + } + return result; + } + + private: + std::vector> childs_; + ReducePattern root_; +}; + +template +struct ReduceTreePlusTrivialPattern { + explicit ReduceTreePlusTrivialPattern(const ReduceTreePattern& tree, + const TrivialPattern& sink_trivial) + : tree(tree), sink_trivial(sink_trivial) {} + ReduceTreePattern tree; + TrivialPattern sink_trivial; + std::vector ops() const { + return UniqueConcatVector(tree.ops(), sink_trivial.ops()); + } + static std::string name() { return "ReduceTree+Trivial"; } + std::vector fake_reduce_iter_idx; +}; + +template +class UnsupportPattern {}; +template +class HorizontalFusionPattern {}; + +template +using StmtPatternBase = std::variant, + ReducePattern, + ReduceTreePattern, + ReduceTreePlusTrivialPattern, + HorizontalFusionPattern, + UnsupportPattern>; + +template +struct StmtPattern final : public StmtPatternBase { + using StmtPatternBase::StmtPatternBase; + const StmtPatternBase& variant() const { + return static_cast&>(*this); + } +}; +} // namespace cinn::fusion diff --git a/paddle/cinn/operator_fusion/pattern_fuser.h b/paddle/cinn/operator_fusion/pattern_fuser.h new file mode 100644 index 0000000000000..802031b6b2304 --- /dev/null +++ b/paddle/cinn/operator_fusion/pattern_fuser.h @@ -0,0 +1,208 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "glog/logging.h" + +#include "paddle/cinn/adt/adt.h" +#include "paddle/cinn/hlir/framework/op.h" +#include "paddle/cinn/operator_fusion/pattern.h" +#include "paddle/cinn/operator_fusion/utils.h" + +// This file is the protocol of the pattern fuser. Please implement +// ConvertToStmtPattern and MergePatternImpl in the specializations. 
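// Illustrative sketch (hypothetical stage, for orientation only): a stage is
// a tag type plus specializations of this protocol, mirroring
// frontend/pattern.h and backend/pattern.h:
//
//   struct MyStage {};
//   namespace cinn::fusion {
//   template <>
//   struct PatternContent<MyStage> {
//     explicit PatternContent(pir::Operation* op) : op(op) {}
//     pir::Operation* op;  // plus whatever per-stage payload is needed
//   };
//   }  // namespace cinn::fusion
//   // ...then specialize ConvertToStmtPattern and MergePatternImpl for
//   // MyStage, as the frontend and backend pattern_fuser headers do.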
+ +namespace cinn::fusion { + +template +ReducePattern ToReducePattern(const StmtPattern& second) { + return std::get>(second); +} + +template +std::string GetPatternName(const StmtPattern& s) { + return std::visit([](const auto& impl) { return impl.name(); }, s.variant()); +} + +template +StmtPattern ConvertToStmtPattern(const PatternContent& content); + +template +std::vector GetOpsInPattern(const StmtPattern& pattern) { + return std::visit([](const auto& impl) { return impl.ops(); }, + pattern.variant()); +} + +template +bool IsReducePattern(const StmtPattern& pattern) { + return std::holds_alternative>(pattern); +} + +template +bool IsReduceTreePattern(const StmtPattern& pattern) { + return std::holds_alternative>(pattern); +} + +template +bool IsOpsDependents(const StmtPattern& pattern) { + return std::holds_alternative>(pattern); +} + +template +bool IsUnsupportPattern(const StmtPattern& pattern) { + return std::holds_alternative>(pattern); +} + +template +bool IsReduceTrivialPattern(const StmtPattern& pattern) { + return std::holds_alternative>(pattern); +} + +template +std::unordered_set GetPatternInputValuesIncludeInner( + const StmtPattern& A) { + std::unordered_set result; + for (const auto& op : GetOpsInPattern(A)) { + for (const auto& value : op->operands()) { + result.insert(value.source()); + } + } + return result; +} + +template +std::unordered_set GetPatternOutputValuesIncludedInner( + const StmtPattern& A) { + std::unordered_set result; + for (const auto& op : GetOpsInPattern(A)) { + for (const auto& value : op->results()) { + result.insert(value); + } + } + return result; +} + +template +std::unordered_set GetPatternInputValues(const StmtPattern& A) { + auto all_input_values = GetPatternInputValuesIncludeInner(A); + for (const auto& value : GetPatternOutputValuesIncludedInner(A)) { + all_input_values.erase(value); + } + VLOG(4) << "GetPatternInputValues: " << all_input_values.size(); + return all_input_values; +} + +template +std::string StmtPatternDebugStr(const StmtPattern& stmt) { + std::stringstream ss; + auto all_ops = GetOpsInPattern(stmt); + ss << "StmtPattern, size " << all_ops.size() << " :\n"; + ss << OpsDebugStr(all_ops); + return ss.str(); +} + +static bool IsDirectUpstream(const pir::Operation* upstream, + const pir::Operation* downstream) { + for (const auto& value : downstream->results()) { + for (const auto& operand : upstream->operands()) { + if (value == operand.source()) { + return true; + } + } + } + return false; +} + +template +int InsertDownstreamIntoTree(const ReduceTreePattern& upstream, + ReduceTreePattern& downstream) { // NOLINT + if (IsDirectUpstream(upstream.GetRootPattern().GetReduceOp(), + downstream.GetRootPattern().GetReduceOp())) { + downstream.InsertChild(upstream); + return 1; + } + int insert_num = 0; + for (auto& child : downstream.childs()) { + insert_num += InsertDownstreamIntoTree(upstream, child); + } + return insert_num; +} + +template +StmtPattern MergePatternImpl(const ReduceTreePattern& upstream, + const ReduceTreePattern& downstream) { + ReduceTreePattern result = downstream; // copy first. 
+ int insert_num = InsertDownstreamIntoTree(upstream, result); + CHECK(insert_num == 1) << "Must insert only once, but insert " << insert_num; + return result; +} + +template +StmtPattern MergePatternImpl(const ReduceTreePattern& first, + const TrivialPattern& second); + +template +StmtPattern MergePatternImpl(const TrivialPattern& first, + const ReducePattern& second); + +template +StmtPattern MergePatternImpl(const TrivialPattern& first, + const TrivialPattern& second); + +template +StmtPattern MergePatternImpl(const HorizontalFusionPattern& first, + const HorizontalFusionPattern& second); + +template +StmtPattern MergePattern(const StmtPattern& first, + const StmtPattern& second) { + VLOG(4) << "MergePattern: " << GetPatternName(first) << " x " + << GetPatternName(second); + const auto PatternMatch = adt::match{ + [&](const ReduceTreePattern& lhs, const ReduceTreePattern& rhs) { + return MergePatternImpl(lhs, rhs); + }, + [&](const ReduceTreePattern& lhs, const TrivialPattern& rhs) { + return MergePatternImpl(lhs, rhs); + }, + [&](const TrivialPattern& lhs, const ReducePattern& rhs) { + return MergePatternImpl(lhs, rhs); + }, + [&](const TrivialPattern& lhs, const TrivialPattern& rhs) { + return MergePatternImpl(lhs, rhs); + }, + [&](const HorizontalFusionPattern& lhs, + const HorizontalFusionPattern& rhs) { + return MergePatternImpl(lhs, rhs); + }, + [&](const auto& lhs, const auto& rhs) -> StmtPattern { + CHECK(false) << "Found not support merge!" << GetPatternName(first) + << "X" << GetPatternName(second); + }, + }; + return std::visit(PatternMatch, first.variant(), second.variant()); +} + +} // namespace cinn::fusion diff --git a/paddle/cinn/operator_fusion/pattern_graph.cc b/paddle/cinn/operator_fusion/pattern_graph.cc new file mode 100644 index 0000000000000..547f7ff9e14cf --- /dev/null +++ b/paddle/cinn/operator_fusion/pattern_graph.cc @@ -0,0 +1,258 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
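// Summary sketch (restating the implementation below, not adding to it):
// ClusterOps applies a fixed sequence of graph rewrites, each expressed as a
// GraphTransformer over a (matcher, operation) pair:
//
//   SinkTrivialPattern();         // fuse trivial ops into their consumers
//   ReduceLiftReduceTree();       // ReducePattern -> ReduceTreePattern
//   ReduceTreeGrown();            // merge reduce trees along edges
//   ReduceTree_Trivial_Fusion();  // attach a sink trivial op to a tree
//   HorizontalFusion();           // merge independent same-shape patterns
//   SortByTopoOrder();            // emit nodes in topological order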
+ +#include "paddle/cinn/operator_fusion/pattern_graph.h" +#include "paddle/cinn/operator_fusion/backend/pattern.h" +#include "paddle/cinn/operator_fusion/backend/pattern_fuser.h" +#include "paddle/cinn/operator_fusion/frontend/pattern.h" +#include "paddle/cinn/operator_fusion/frontend/pattern_fuser.h" + +namespace cinn::fusion { + +template +std::vector> PatternGraph::ClusterOps() { + VLOG(4) << "[Group Cluster] Initial Condition: " << GraphInfo(); + + VLOG(4) << "[Group Cluster] Start SinkTrivialPattern"; + SinkTrivialPattern(); + VLOG(4) << "[Group Cluster] After SinkTrivialPattern: " << GraphInfo(); + + // ReducePattern -> ReduceTreePattern + VLOG(4) << "[Group Cluster] Start ReduceLiftReduceTree"; + ReduceLiftReduceTree(); + VLOG(4) << "[Group Cluster] After ReduceLiftReduceTree: " << GraphInfo(); + + // ReduceTreePattern + ReduceTreePattern fusion + VLOG(4) << "[Group Cluster] Start ReduceTreeGrown"; + ReduceTreeGrown(); + VLOG(4) << "[Group Cluster] After ReduceTreeGrown: " << GraphInfo(); + + // ReduceTreePattern + TrivialPattern fusion. + VLOG(4) << "[Group Cluster] Start ReduceTree_Trivial_Fusion"; + ReduceTree_Trivial_Fusion(); + VLOG(4) << "[Group Cluster] After ReduceTree_Trivial_Fusion: " << GraphInfo(); + + // Horizontal fusion. + VLOG(4) << "[Group Cluster] Start HorizontalFusion"; + HorizontalFusion(); + VLOG(4) << "[Group Cluster] After HorizontalFusion: " << GraphInfo(); + + return SortByTopoOrder(); +} + +template +std::vector> PatternGraph::SortByTopoOrder() { + // sort all_pattern_nodes_ by topo order. + std::vector> res; + std::list> topo_queue; + std::map, int> degree; + for (const auto& node : all_pattern_nodes_) { + degree[node] = node->upstream_.size(); + if (degree[node] == 0) { + topo_queue.push_back(node); + } + } + while (!topo_queue.empty()) { + PatternNodePtr node = topo_queue.front(); + topo_queue.pop_front(); + res.push_back(node); + for (const auto& downstream_op : node->downstream_) { + degree[downstream_op] = degree[downstream_op] - 1; + if (degree[downstream_op] == 0) { + topo_queue.push_back(downstream_op); + } + } + } + return res; +} + +template +void PatternGraph::SinkTrivialPattern() { + GraphTransformer< + NodePattern, + T, + And>>, + IsNotOutputNodeMatcher>, + MergeTrivialPatternOperation>(this); +} + +template +void PatternGraph::ReduceLiftReduceTree() { + GraphTransformer< + NodePattern, + T, + And, StmtPatternGraphMatcher>>, + LiftReduceToReduceTreeOperation>(this); +} + +template +void PatternGraph::HorizontalFusion() { + GraphTransformer>, + LiftToHorizontalFusionPatternOperation>(this); + + GraphTransformer(this); +} + +template +void PatternGraph::ReduceTreeGrown() { + GraphTransformer, + MergeReduceTreeOperation>(this); +} + +template +void PatternGraph::ReduceTree_Trivial_Fusion() { + GraphTransformer< + NodePattern, + T, + And, + MergeReduceTreeAndTrivialOperation>(this); +} + +template +PatternGraph::PatternGraph(const std::vector>& contents, + const std::vector& outputs, + const PolicyManager policy_manager, + const PolicyManager topo_manager) + : policy_manager_(policy_manager), + topo_manager_(topo_manager), + outputs_(outputs) { + std::unordered_map> op_to_node_map; + + VLOG(4) << "len(outputs) = " << outputs_.size(); + for (const auto& v : outputs) { + VLOG(4) << "output is" << OpsDebugStr({v.defining_op()}); + } + + for (const auto& content : contents) { + PatternNodePtr node = std::make_shared>(content); + op_to_node_map[content.op] = node; + all_pattern_nodes_.emplace(node); + node->sink_op_ = content.op; + } + + for (const 
auto& content : contents) { + PatternNodePtr cur_node = op_to_node_map[content.op]; + + // add upstream nodes + for (int i = 0; i < content.op->num_operands(); ++i) { + ::pir::Operation* input_op = content.op->operand_source(i).defining_op(); + if (op_to_node_map.find(input_op) != op_to_node_map.end()) { + PatternNodePtr upstream_node = op_to_node_map[input_op]; + cur_node->upstream_.push_back(upstream_node); + } + } + + // add downstream nodes + for (int i = 0; i < content.op->num_results(); ++i) { + pir::Value related_value = content.op->result(i); + for (auto consumer_it = related_value.use_begin(); + consumer_it != related_value.use_end(); + ++consumer_it) { + ::pir::Operation* output_op = consumer_it->owner(); + if (op_to_node_map.find(output_op) != op_to_node_map.end()) { + PatternNodePtr downstream_node = op_to_node_map[output_op]; + cur_node->downstream_.push_back(downstream_node); + } + } + } + } + + VLOG(4) << "PatternGraph Created, pattern node size: " + << all_pattern_nodes_.size(); +} + +template +void PatternGraph::RemoveNode(const PatternNodePtr& node) { + VLOG(4) << "Start Remove: " << node; + if (all_pattern_nodes_.find(node) != all_pattern_nodes_.end()) { + VLOG(4) << "Removed! "; + all_pattern_nodes_.erase(node); + } + + for (PatternNodePtr& upstream : node->upstream_) { + RemoveFromVector(&upstream->downstream_, node); + } + + for (PatternNodePtr& downstream : node->downstream_) { + RemoveFromVector(&downstream->upstream_, node); + } +} + +template +void PatternGraph::AppendNode(const PatternNodePtr& node) { + all_pattern_nodes_.emplace(node); +} + +template +std::string PatternGraph::GraphInfo() const { + std::stringstream ss; + ss << "\n========= GraphInfo ==========="; + for (const auto& v : all_pattern_nodes_) { + ss << "\n" << v->DebugStr(); + ss << "\n IsOutput: " << IsOutputNodeMatcher()(*this, v); + } + ss << "\n==============================="; + return ss.str(); +} + +template +PatternNodePtr PatternGraph::MergeNode( + const PatternNodePtr& upstream, const PatternNodePtr& downstream) { + PatternNodePtr merged_node = + std::make_shared>(upstream, downstream); + + // deal with the reference. + ExtendVector(&merged_node->upstream_, upstream->upstream_); + ExtendVector(&merged_node->upstream_, downstream->upstream_); + RemoveFromVector(&merged_node->upstream_, upstream); + + ExtendVector(&merged_node->downstream_, upstream->downstream_); + ExtendVector(&merged_node->downstream_, downstream->downstream_); + RemoveFromVector(&merged_node->downstream_, downstream); + + for (const auto& upstream_node : merged_node->upstream_) { + upstream_node->downstream_.push_back(merged_node); + RemoveFromVector(&upstream_node->downstream_, upstream); + RemoveFromVector(&upstream_node->downstream_, downstream); + } + for (const auto& downstream_node : merged_node->downstream_) { + downstream_node->upstream_.push_back(merged_node); + RemoveFromVector(&downstream_node->downstream_, upstream); + RemoveFromVector(&downstream_node->downstream_, downstream); + } + + const auto vec_unique = [](const std::vector>& vec) { + auto set = std::unordered_set(vec.begin(), vec.end()); + return set.size() == vec.size(); + }; + + CHECK(vec_unique(merged_node->upstream_)); + CHECK(vec_unique(merged_node->downstream_)); + + // deal with the graph storage. 
+ AppendNode(merged_node); + return merged_node; +} + +template class PatternGraph; +template class PatternGraph; + +} // namespace cinn::fusion diff --git a/paddle/cinn/operator_fusion/pattern_graph.h b/paddle/cinn/operator_fusion/pattern_graph.h new file mode 100644 index 0000000000000..589235d8d76a8 --- /dev/null +++ b/paddle/cinn/operator_fusion/pattern_graph.h @@ -0,0 +1,373 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/cinn/operator_fusion/pattern_node.h" +#include "paddle/cinn/operator_fusion/policy/policy_manager.h" +#include "paddle/cinn/operator_fusion/policy/relative_judge_policy.h" +#include "paddle/cinn/operator_fusion/utils.h" + +namespace cinn::fusion { + +template +using PatternNodePtrSet = std::unordered_set>; + +template +class PatternGraph { + public: + PatternGraph(const std::vector>& nodes, + const std::vector& outputs, + const PolicyManager policy_manager, + const PolicyManager topo_manager); + + std::vector> ClusterOps(); + + void SinkTrivialPattern(); + void HorizontalFusion(); + void ReduceLiftReduceTree(); + void ReduceTreeGrown(); + void ReduceTree_Trivial_Fusion(); + + void RemoveNode(const PatternNodePtr& node); + void AppendNode(const PatternNodePtr& node); + std::string GraphInfo() const; + PatternNodePtr MergeNode(const PatternNodePtr& upstream, + const PatternNodePtr& downstream); + std::vector> SortByTopoOrder(); + + public: + PatternNodePtrSet all_pattern_nodes_; + std::vector outputs_; + PolicyManager policy_manager_; + PolicyManager topo_manager_; +}; + +// PatternGraphFusionOperation := (GraphMatcher, GraphOperation) +// SearchAlgorithm := NodePattern | EdgePattern | GraphMatcher +// GraphOperation := Merge2Node | SplitNode | SplitAllAndMergeDownstream + +struct NodePattern {}; +struct EdgePattern {}; +struct GraphPattern {}; // not implemented. +struct NodePairPattern {}; // not implemented. 
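// Illustrative sketch (hypothetical pass built from the pieces below): a
// rewrite is GraphTransformer<SearchKind, Stage, Matcher, Operation>; e.g.
// lifting every ReducePattern node, without the extra guards used by
// ReduceLiftReduceTree, would be:
//
//   template <typename T>
//   void LiftAllReduces(cinn::fusion::PatternGraph<T>* graph) {
//     cinn::fusion::GraphTransformer<
//         cinn::fusion::NodePattern,
//         T,
//         cinn::fusion::StmtPatternGraphMatcher<
//             cinn::fusion::ReducePattern<T>>,
//         cinn::fusion::LiftReduceToReduceTreeOperation>(graph);
//   }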
+ +template +struct SearchAlgorithm {}; + +template +struct SearchAlgorithm { + PatternGraph* graph_; + PatternNodePtrSet visited_nodes; + + explicit SearchAlgorithm(PatternGraph* graph) { + VLOG(4) << "Create NodePattern algorithm."; + graph_ = graph; + } + + PatternNodePtr FindMatchedNode() { + for (PatternNodePtr iter_node : graph_->all_pattern_nodes_) { + if (GraphMatcher()(*graph_, iter_node) && + !visited_nodes.count(iter_node)) { + visited_nodes.insert(iter_node); + VLOG(4) << "Find Matched Node: " << iter_node; + return iter_node; + } + } + VLOG(4) << "Can't find matched node any more."; + return nullptr; + } + + void operator()() { + while (true) { + PatternNodePtr node = FindMatchedNode(); + if (node == nullptr) { + break; + } + GraphOperation()(graph_, node); + } + } +}; + +template +struct SearchAlgorithm { + PatternGraph* graph_; + std::set, PatternNodePtr>> + visited_node_pair; + explicit SearchAlgorithm(PatternGraph* graph) { + VLOG(4) << "Create NodePairPattern algorithm."; + graph_ = graph; + } + std::optional, PatternNodePtr>> + FindMatchedPair() { + for (PatternNodePtr i : graph_->all_pattern_nodes_) { + for (PatternNodePtr j : graph_->all_pattern_nodes_) { + if (i == j) continue; + const auto& pair = std::make_pair(i, j); + if (GraphMatcher()(*graph_, i, j) && !visited_node_pair.count(pair)) { + visited_node_pair.insert(pair); + VLOG(4) << "Find Matched Node Pair: (" << i << ", " << j << ")"; + return pair; + } + } + } + VLOG(4) << "Can't find matched node any more."; + return {}; + } + void operator()() { + while (true) { + const auto& node = FindMatchedPair(); + if (!node.has_value()) break; + const auto& [i, j] = node.value(); + GraphOperation()(graph_, i, j); + } + } +}; + +// Operation + +struct MergeReduceTreeOperation { + template + void operator()(PatternGraph* graph, PatternNodePtr node) { + CHECK_EQ(node->downstream_.size(), 1); + auto downstream = node->downstream_.at(0); + auto merged_node = graph->MergeNode(node, downstream); + graph->RemoveNode(downstream); + graph->RemoveNode(node); + VLOG(4) << "MergeReduceTreeOperation: \nupstream " << node->DebugStr() + << "\ndownstream " << downstream->DebugStr() << "\nmerged " + << merged_node->DebugStr(); + } +}; + +struct MergeReduceTreeAndTrivialOperation { + template + void operator()(PatternGraph* graph, PatternNodePtr node) { + CHECK_EQ(node->downstream_.size(), 1); + auto downstream = node->downstream_.at(0); + auto fake_reduce_iter_idx = + graph->policy_manager_.GetFakeReduceIterIdx(node, downstream); + PatternNodePtr merged_node = graph->MergeNode(node, downstream); + std::get>(merged_node->stmt_pattern_) + .fake_reduce_iter_idx = fake_reduce_iter_idx; + graph->RemoveNode(downstream); + graph->RemoveNode(node); + VLOG(4) << "MergeReduceTreeAndTrivialOperation: \nupstream " + << node->DebugStr() << "\ndownstream " << downstream->DebugStr() + << "\nmerged " << merged_node->DebugStr(); + } +}; + +struct LiftReduceToReduceTreeOperation { + template + void operator()(PatternGraph* graph, PatternNodePtr node) { + const auto& reduce_pattern = ToReducePattern(node->stmt_pattern_); + node->stmt_pattern_ = ReduceTreePattern({}, reduce_pattern); + VLOG(4) << "LiftReduceToReduceTreeOperation: \nnode " << node->DebugStr(); + } +}; + +struct MergeTrivialPatternOperation { + template + void operator()(PatternGraph* graph, + PatternNodePtr upstream) { + std::vector> fusion_candidate = + upstream->downstream_; + upstream->downstream_.clear(); + for (const auto& downstream : fusion_candidate) { + if (std::holds_alternative>( 
+ downstream->stmt_pattern_) || + std::holds_alternative>( + downstream->stmt_pattern_)) { + auto merged_node = graph->MergeNode(upstream, downstream); + graph->RemoveNode(downstream); + VLOG(4) << "MergeTrivialPatternOperation: \nupstream " + << upstream->DebugStr() << "\ndownstream " + << downstream->DebugStr() << "\nmerged " + << merged_node->DebugStr(); + } else { + upstream->downstream_.push_back(downstream); + } + } + if (upstream->downstream_.empty()) { + graph->RemoveNode(upstream); + } + } +}; + +struct LiftToHorizontalFusionPatternOperation { + template + void operator()(PatternGraph* graph, PatternNodePtr i) { + i->stmt_pattern_ = HorizontalFusionPattern({i->stmt_pattern_}); + } +}; + +// Matcher + +template +struct AlwaysTrue { + template + bool operator()(const PatternGraph& graph, const PatternNodePtr& node) { + return true; + } +}; + +template +struct StmtPatternGraphMatcher { + template + bool operator()(const PatternGraph& graph, const PatternNodePtr& node) { + return GetPatternName(node->stmt_pattern_) == StmtPattern::name(); + } +}; + +struct CanFuseRxTMatcher { + template + bool operator()(const PatternGraph& graph, const PatternNodePtr& node) { + return (std::holds_alternative>(node->stmt_pattern_) && + !node->downstream_.empty() && + std::holds_alternative>( + node->downstream_.at(0)->stmt_pattern_)); + } +}; + +struct CanFuseReduceTreeMatcher { + template + bool operator()(const PatternGraph& graph, const PatternNodePtr& node) { + return StmtPatternGraphMatcher>()(graph, node) && + !node->downstream_.empty() && + std::holds_alternative>( + node->downstream_.at(0)->stmt_pattern_) && + graph.policy_manager_.CanFuse(node, node->downstream_.at(0)); + } +}; + +struct CanFuseReduceTreeAndTrivialMatcher { + template + bool operator()(const PatternGraph& graph, const PatternNodePtr& node) { + return StmtPatternGraphMatcher>()(graph, node) && + !node->downstream_.empty() && + std::holds_alternative>( + node->downstream_.at(0)->stmt_pattern_) && + graph.policy_manager_.CanFuse(node, node->downstream_.at(0)); + } +}; + +struct HorizontalFusionConstrain { + template + bool operator()(const PatternGraph& graph, + const PatternNodePtr& first, + const PatternNodePtr& second) { + if (!StmtPatternGraphMatcher>()(graph, first)) { + return false; + } + if (!StmtPatternGraphMatcher>()(graph, second)) { + return false; + } + const auto& first_dim = first->sink_op_->result(0) + .type() + .template dyn_cast() + .dims(); + const auto& second_dim = second->sink_op_->result(0) + .type() + .template dyn_cast() + .dims(); + return graph.topo_manager_.CanFuse(first, second) && + first_dim == second_dim; + } +}; + +struct HorizontalFusionOperation { + template + void operator()(PatternGraph* graph, + const PatternNodePtr& i, + const PatternNodePtr& j) { + CHECK(GetPatternName(i->stmt_pattern_) == + HorizontalFusionPattern::name()); + CHECK(GetPatternName(j->stmt_pattern_) == + HorizontalFusionPattern::name()); + graph->MergeNode(i, j); + graph->RemoveNode(i); + graph->RemoveNode(j); + } +}; + +struct NonSinkNodeMatcher { + template + bool operator()(const PatternGraph& graph, const PatternNodePtr& node) { + return !node->downstream_.empty(); + } +}; + +struct IsOutputNodeMatcher { + template + bool operator()(const PatternGraph& graph, const PatternNodePtr& node) { + bool res = IsAnyFirstInSecond(node->sink_op_->results(), graph.outputs_); + return res; + } +}; + +struct IsNotOutputNodeMatcher { + template + bool operator()(const PatternGraph& graph, const PatternNodePtr& node) { + bool res = 
!IsOutputNodeMatcher()(graph, node); + return res; + } +}; + +template +struct DownstreamSmallerThan { + template + bool operator()(const PatternGraph& graph, const PatternNodePtr& node) { + return node->downstream_.size() < N; + } +}; + +template +struct And { + template + bool operator()(const PatternGraph& graph, const PatternNodePtr& node) { + return A()(graph, node) && B()(graph, node); + } +}; + +template +struct Or { + template + bool operator()(const PatternGraph& graph, const PatternNodePtr& node) { + return A()(graph, node) || B()(graph, node); + } +}; + +template +struct Not { + template + bool operator()(const PatternGraph& graph, const PatternNodePtr& node) { + return !A()(graph, node); + } +}; + +template +void GraphTransformer(PatternGraph* graph) { + VLOG(4) << "Start GraphTransformer..."; + auto alog = + SearchAlgorithm(graph); + alog(); +} + +} // namespace cinn::fusion diff --git a/paddle/cinn/operator_fusion/pattern_node.h b/paddle/cinn/operator_fusion/pattern_node.h new file mode 100644 index 0000000000000..d6c9f8202669e --- /dev/null +++ b/paddle/cinn/operator_fusion/pattern_node.h @@ -0,0 +1,59 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/cinn/operator_fusion/pattern.h" +#include "paddle/cinn/operator_fusion/pattern_fuser.h" +#include "paddle/cinn/operator_fusion/utils.h" + +namespace cinn::fusion { + +template +struct PatternNode { + using PatternNodePtr = std::shared_ptr>; + + explicit PatternNode(const PatternContent& content) + : sink_op_(content.op), stmt_pattern_(ConvertToStmtPattern(content)) {} + + explicit PatternNode(PatternNodePtr fused_up_node, + PatternNodePtr fused_down_node) + : sink_op_(fused_down_node->sink_op_), + stmt_pattern_(MergePattern(fused_up_node->stmt_pattern_, + fused_down_node->stmt_pattern_)) {} + + std::string DebugStr() const { + std::stringstream ss; + ss << "Node: " << this << ", Pattern: " << GetPatternName(stmt_pattern_) + << "\n -u>: "; + for (const auto& u : upstream_) { + ss << u << ", "; + } + ss << "\n stmt_pattern_; + pir::Operation* sink_op_; + + std::vector upstream_; + std::vector downstream_; +}; + +template +using PatternNodePtr = std::shared_ptr>; +} // namespace cinn::fusion diff --git a/paddle/cinn/operator_fusion/policy/CMakeLists.txt b/paddle/cinn/operator_fusion/policy/CMakeLists.txt new file mode 100644 index 0000000000000..94a33e9599640 --- /dev/null +++ b/paddle/cinn/operator_fusion/policy/CMakeLists.txt @@ -0,0 +1,8 @@ +gather_srcs( + policy_fusion_src + SRCS + shardable_axes_base.cc + policy_manager.cc + relative_judge_policy.cc + general_topo_policy.cc + shardable_axes_policy.cc) diff --git a/paddle/cinn/operator_fusion/policy/general_topo_policy.cc b/paddle/cinn/operator_fusion/policy/general_topo_policy.cc new file mode 100644 index 0000000000000..53d54b8fa0f65 --- /dev/null +++ b/paddle/cinn/operator_fusion/policy/general_topo_policy.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2024 PaddlePaddle 
Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/cinn/operator_fusion/policy/general_topo_policy.h" +#include "paddle/cinn/operator_fusion/backend/pattern.h" +#include "paddle/cinn/operator_fusion/backend/pattern_fuser.h" +#include "paddle/cinn/operator_fusion/frontend/pattern.h" +#include "paddle/cinn/operator_fusion/frontend/pattern_fuser.h" + +namespace cinn::fusion { + +template +bool IsDownstreamNode(const PatternNodePtr start, + const PatternNodePtr target) { + if (start == target) return true; + for (const auto& down_node : start->downstream_) { + if (IsDownstreamNode(down_node, target)) return true; + } + return false; +} + +template +bool IsIndirectDownstreamNode(const PatternNodePtr start, + const PatternNodePtr target) { + for (const auto& node : start->downstream_) { + if (node == target) continue; + if (IsDownstreamNode(node, target)) return true; + } + return false; +} + +template +bool GeneralTopoPolicy::CanFuse(const PatternNodePtr& first, + const PatternNodePtr& second) { + VLOG(4) << "Start GeneralTopoPolicy"; + return !(IsIndirectDownstreamNode(first, second) || + IsIndirectDownstreamNode(second, first)); +} + +template class GeneralTopoPolicy; +template class GeneralTopoPolicy; + +} // namespace cinn::fusion diff --git a/paddle/cinn/operator_fusion/policy/general_topo_policy.h b/paddle/cinn/operator_fusion/policy/general_topo_policy.h new file mode 100644 index 0000000000000..8fc8e360dd01d --- /dev/null +++ b/paddle/cinn/operator_fusion/policy/general_topo_policy.h @@ -0,0 +1,28 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/cinn/operator_fusion/policy/policy_manager.h" + +namespace cinn::fusion { + +template +class GeneralTopoPolicy final : virtual public Policy { + public: + bool CanFuse(const PatternNodePtr& upstream, + const PatternNodePtr& downstream) override; + std::string Name() { return "GeneralTopoPolicy"; } +}; + +} // namespace cinn::fusion diff --git a/paddle/cinn/operator_fusion/policy/policy_manager.cc b/paddle/cinn/operator_fusion/policy/policy_manager.cc new file mode 100644 index 0000000000000..5d21dfed23b90 --- /dev/null +++ b/paddle/cinn/operator_fusion/policy/policy_manager.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/cinn/operator_fusion/policy/policy_manager.h" +#include "paddle/cinn/operator_fusion/backend/pattern.h" +#include "paddle/cinn/operator_fusion/frontend/pattern.h" +#include "paddle/common/enforce.h" + +namespace cinn::fusion { + +template +bool PolicyManager::CanFuse(const PatternNodePtr& upstream, + const PatternNodePtr& downstream) const { + for (const auto& policy : policies_) { + if (!policy->CanFuse(upstream, downstream)) return false; + } + return true; +} + +template +std::vector PolicyManager::GetFakeReduceIterIdx( + const PatternNodePtr& upstream, + const PatternNodePtr& downstream) const { + for (const auto& policy : policies_) { + if (policy->Name() == "RelativeJudgePolicy") { + return policy->GetFakeReduceIterIdx(upstream, downstream); + } + } + return {}; +} + +template class PolicyManager; +template class PolicyManager; + +} // namespace cinn::fusion diff --git a/paddle/cinn/operator_fusion/policy/policy_manager.h b/paddle/cinn/operator_fusion/policy/policy_manager.h new file mode 100644 index 0000000000000..33c4227f7b1c7 --- /dev/null +++ b/paddle/cinn/operator_fusion/policy/policy_manager.h @@ -0,0 +1,51 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/cinn/operator_fusion/pattern_node.h" + +namespace cinn::fusion { + +template +class Policy { + public: + virtual std::string Name() = 0; + virtual bool CanFuse(const PatternNodePtr& upstream, + const PatternNodePtr& downstream) = 0; + virtual std::vector GetFakeReduceIterIdx( + const PatternNodePtr& upstream, const PatternNodePtr& downstream) { + return {}; + } +}; + +template +using PolicyPtr = std::shared_ptr>; + +template +class PolicyManager { + public: + explicit PolicyManager(const std::vector>& policies) + : policies_(policies) {} + bool CanFuse(const PatternNodePtr& upstream, + const PatternNodePtr& downstream) const; + std::vector GetFakeReduceIterIdx( + const PatternNodePtr& upstream, + const PatternNodePtr& downstream) const; + + private: + std::vector> policies_; +}; + +} // namespace cinn::fusion diff --git a/paddle/cinn/operator_fusion/policy/relative_judge_policy.cc b/paddle/cinn/operator_fusion/policy/relative_judge_policy.cc new file mode 100644 index 0000000000000..630403776b49d --- /dev/null +++ b/paddle/cinn/operator_fusion/policy/relative_judge_policy.cc @@ -0,0 +1,342 @@ +// Copyright (c) 2024 PaddlePaddle Authors. 
All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/operator_fusion/policy/relative_judge_policy.h"
+#include "paddle/cinn/operator_fusion/backend/pattern.h"
+#include "paddle/cinn/operator_fusion/frontend/pattern.h"
+
+namespace cinn::fusion {
+
+template <typename T>
+bool RelativeJudgePolicy<T>::IsDownstreamStmtDependReduceOp(
+    pir::Operation* reduce, const StmtPattern<T>& downstream) {
+  const auto& values = GetPatternInputValues(downstream);
+  for (const auto& value : reduce->results()) {
+    if (std::find(values.begin(), values.end(), value) != values.end()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+template <typename T>
+std::optional<ReducePattern<T>>
+RelativeJudgePolicy<T>::GetDownstreamFromCandidate(
+    const ReducePattern<T>& upstream,
+    const std::vector<ReducePattern<T>>& candidates) {
+  pir::Operation* reduce = upstream.GetReduceOp();
+  for (const auto& candidate : candidates) {
+    if (IsDownstreamStmtDependReduceOp(reduce, candidate)) {
+      return candidate;
+    }
+  }
+  return {};
+}
+
+SplitDims SplitReduceInputDimsIfRelatedWithNonReduceAxis(
+    const ShardableAxesSignature& signature, pir::Operation* op) {
+  const auto& v = op->operand_source(0);
+  const auto& input_names = signature.inputs[0].axis_names;
+  const auto& output_names = signature.outputs[0].axis_names;
+  std::set<std::string> output_names_set(output_names.begin(),
+                                         output_names.end());
+  auto result = SplitDims();
+  int idx = 0;
+  for (const auto& in : input_names) {
+    if (output_names_set.count(in) == 0) {
+      result.non_related.emplace_back(v, idx);
+    } else {
+      result.related.emplace_back(v, idx);
+    }
+    idx += 1;
+  }
+  return result;
+}
+
+SplitDims SplitReduceOutputDimsIfRelatedWithNonReduceAxis(
+    const ShardableAxesSignature& signature, const pir::Operation* op) {
+  const auto& v = op->result(0);
+  const auto& input_names = signature.inputs[0].axis_names;
+  const auto& output_names = signature.outputs[0].axis_names;
+  std::set<std::string> input_names_set(input_names.begin(),
+                                        input_names.end());
+  auto result = SplitDims();
+  int idx = 0;
+  for (const auto& name : output_names) {
+    if (input_names_set.count(name) == 0) {
+      result.non_related.emplace_back(v, idx);
+    } else {
+      result.related.emplace_back(v, idx);
+    }
+    idx += 1;
+  }
+  return result;
+}
+
+template <typename T>
+bool RelativeJudgePolicy<T>::IsBroadcastEdge(
+    const std::vector<ValueDim>& upstream_out_dims,
+    const std::vector<ValueDim>& downstream_reduce_dims) {
+  VLOG(4) << "IsBroadcastEdge: upstream_out_dims.size() = "
+          << upstream_out_dims.size();
+  VLOG(4) << "IsBroadcastEdge: downstream_reduce_dims.size() = "
+          << downstream_reduce_dims.size();
+
+  for (const auto& downstream_reduce_dim : downstream_reduce_dims) {
+    for (const auto& upstream_out_dim : upstream_out_dims) {
+      VLOG(4) << "upstream_out_dim: " << upstream_out_dim.DebugStr()
+              << " downstream_reduce_dim: "
+              << downstream_reduce_dim.DebugStr();
+      if (IsRelated(upstream_out_dim, downstream_reduce_dim)) {
+        return false;
+      }
+    }
+  }
+
+  VLOG(4) << "IsBroadcastEdge: true";
+  return true;
+}
+
+template <typename T>
+bool RelativeJudgePolicy<T>::ReduceTreeGrownCanMerge(
+    const PatternNodePtr<T>& upstream, const PatternNodePtr<T>& downstream) {
+  const auto& upstream_tree =
+      std::get<ReduceTreePattern<T>>(upstream->stmt_pattern_);
+  VLOG(4) << "upstream->stmt_pattern_: "
+          << OpsDebugStr(GetOpsInPattern(upstream_tree));
+  const auto& downstream_tree =
+      std::get<ReduceTreePattern<T>>(downstream->stmt_pattern_);
+  VLOG(4) << "downstream->stmt_pattern_: "
+          << OpsDebugStr(GetOpsInPattern(downstream_tree));
+  const auto& maybe_downstream_op = GetDownstreamFromCandidate(
+      upstream_tree.GetRootPattern(), downstream_tree.FlattenReducePattern());
+  int idx = 0;
+  for (const auto& r_pattern : downstream_tree.childs()) {
+    idx += 1;
+    VLOG(4) << "downstream_tree.reduce_patterns_"
+            << "[" << idx << "]: " << OpsDebugStr(GetOpsInPattern(r_pattern));
+  }
+  if (!maybe_downstream_op.has_value()) {
+    VLOG(4) << "Cannot find a dependent reduce candidate; fusion is rejected.";
+    return false;
+  }
+  const pir::Value& reduce_out_value =
+      upstream_tree.GetRootPattern().GetReduceOp()->result(0);
+  pir::Operation* downstream_reduce_op =
+      maybe_downstream_op.value().GetReduceOp();
+  const auto& split_reduce_dim_result =
+      SplitReduceInputDimsIfRelatedWithNonReduceAxis(
+          axes_info_.GetSignature(downstream_reduce_op), downstream_reduce_op);
+  VLOG(4) << split_reduce_dim_result.DebugStr();
+  const auto& upstream_output_dims = GetAllValueDimFromValue(reduce_out_value);
+  auto res = IsBroadcastEdge(upstream_output_dims,
+                             split_reduce_dim_result.non_related);
+  VLOG(4) << "ReduceTreeGrownCanMerge: " << res;
+  return res;
+}
+
+template <typename T>
+SplitDims RelativeJudgePolicy<T>::SplitDimsWithRelationship(
+    const std::vector<ValueDim>& targets,
+    const std::vector<ValueDim>& related_with) {
+  VLOG(4) << "SplitDimsWithRelationship";
+  auto result = SplitDims();
+  bool is_related = false;
+  for (auto& target_dim : targets) {
+    is_related = false;
+    for (auto& related_dim : related_with) {
+      if (IsRelated(related_dim, target_dim)) is_related = true;
+    }
+    if (is_related) {
+      result.related.push_back(target_dim);
+    } else {
+      result.non_related.push_back(target_dim);
+    }
+  }
+
+  return result;
+}
+
+bool DimsEqual(const std::vector<ValueDim>& first,
+               const std::vector<ValueDim>& second) {
+  const auto GetDimInfo = [](const std::vector<ValueDim>& dims)
+      -> std::unordered_map<size_t, int> {
+    std::unordered_map<size_t, int> result;
+    for (const auto& dim : dims) {
+      VLOG(4) << "dim: " << dim.DebugStr();
+      size_t value = dim.GetNumericValue();
+      VLOG(4) << "value: " << value;
+      if (result.find(value) == result.end()) {
+        result[value] = 1;
+      } else {
+        result[value] += 1;
+      }
+    }
+    return result;
+  };
+  VLOG(4) << "GetDimInfo(first)";
+  const std::unordered_map<size_t, int>& first_dims = GetDimInfo(first);
+  VLOG(4) << "GetDimInfo(second)";
+  const std::unordered_map<size_t, int>& second_dims = GetDimInfo(second);
+  if (first_dims.size() != second_dims.size()) return false;
+  for (const auto& [dim_value, count] : first_dims) {
+    if (second_dims.find(dim_value) == second_dims.end() ||
+        second_dims.at(dim_value) != count)
+      return false;
+  }
+  return true;
+}
+
+template <typename T>
+bool RelativeJudgePolicy<T>::ReducePlusTrivialCanMerge(
+    const PatternNodePtr<T>& upstream, const PatternNodePtr<T>& downstream) {
+  VLOG(4) << "RT can fuse";
+
+  // const auto& split_reduce_dims_result =
+  //     SplitReduceInputDimsIfRelatedWithNonReduceAxis(
+  //         axes_info_.GetSignature(upstream->sink_op_), upstream->sink_op_);
+
+  // VLOG(4) << split_reduce_dims_result.DebugStr();
+
+  // const auto& upstream_reduce_dims = split_reduce_dims_result.non_related;
+  // const auto& upstream_non_reduce_dims = split_reduce_dims_result.related;
+
+  // TODO(wuzhanfei) fix the bug in the relation analysis when the graph
+  // contains multiple paths; test_rms_norm can cover this case.
+
+  const auto& split_reduce_input_dims_result =
+      SplitReduceInputDimsIfRelatedWithNonReduceAxis(
+          axes_info_.GetSignature(upstream->sink_op_), upstream->sink_op_);
+  VLOG(4) << split_reduce_input_dims_result.DebugStr();
+  const auto& upstream_reduce_dims = split_reduce_input_dims_result.non_related;
+
+  const auto& split_reduce_output_dims_result =
+      SplitReduceOutputDimsIfRelatedWithNonReduceAxis(
+          axes_info_.GetSignature(upstream->sink_op_), upstream->sink_op_);
+  VLOG(4) << split_reduce_output_dims_result.DebugStr();
+  const auto& upstream_non_reduce_dims =
+      split_reduce_output_dims_result.related;
+  // Replace the code above with the original (commented-out) design once the
+  // multi-path bug is fixed.
+
+  const auto& split_trivial_dims_result = SplitDimsWithRelationship(
+      GetAllValueDimFromValue(downstream->sink_op_->result(0)),
+      upstream_non_reduce_dims);
+
+  VLOG(4) << split_trivial_dims_result.DebugStr();
+
+  auto res =
+      DimsEqual(split_trivial_dims_result.non_related, upstream_reduce_dims);
+  res = res || IsFlattenDimSmaller(upstream, downstream);
+  VLOG(4) << "ReducePlusTrivialCanMerge: " << res;
+  return res;
+}
+
+template <typename T>
+bool RelativeJudgePolicy<T>::IsFlattenDimSmaller(
+    const PatternNodePtr<T>& upstream, const PatternNodePtr<T>& downstream) {
+  const auto& split_reduce_dims_result =
+      SplitReduceInputDimsIfRelatedWithNonReduceAxis(
+          axes_info_.GetSignature(upstream->sink_op_), upstream->sink_op_);
+  const auto& upstream_reduce_dims = split_reduce_dims_result.non_related;
+  const auto& upstream_non_reduce_dims = split_reduce_dims_result.related;
+
+  const auto& split_trivial_dims_result = SplitDimsWithRelationship(
+      GetAllValueDimFromValue(downstream->sink_op_->result(0)),
+      upstream_non_reduce_dims);
+
+  VLOG(4) << "IsFlattenDimSmaller: "
+          << axes_info_.GetSignature(downstream->sink_op_).DebugStr();
+  int rank = axes_info_.GetSignature(downstream->sink_op_)
+                 .outputs[0]
+                 .axis_names.size();
+  VLOG(4) << "IsFlattenDimSmaller: " << rank << " "
+          << split_trivial_dims_result.related.size() << " "
+          << upstream_non_reduce_dims.size();
+  bool res = (rank - split_trivial_dims_result.related.size()) <=
+             upstream_non_reduce_dims.size();
+  VLOG(4) << "IsFlattenDimSmaller: " << res;
+  return res;
+}
+
+template <typename T>
+bool RelativeJudgePolicy<T>::CanFuse(const PatternNodePtr<T>& upstream,
+                                     const PatternNodePtr<T>& downstream) {
+  if (std::holds_alternative<ReduceTreePattern<T>>(upstream->stmt_pattern_) &&
+      std::holds_alternative<TrivialPattern<T>>(downstream->stmt_pattern_)) {
+    return ReducePlusTrivialCanMerge(upstream, downstream);
+  }
+  if (std::holds_alternative<ReduceTreePattern<T>>(upstream->stmt_pattern_) &&
+      std::holds_alternative<ReduceTreePattern<T>>(
+          downstream->stmt_pattern_)) {
+    return ReduceTreeGrownCanMerge(upstream, downstream);
+  }
+  return true;  // other case.
+}
+
+template <typename T>
+std::vector<size_t> RelativeJudgePolicy<T>::GetFakeReduceIterIdx(
+    const PatternNodePtr<T>& upstream, const PatternNodePtr<T>& downstream) {
+  if (!std::holds_alternative<ReduceTreePattern<T>>(upstream->stmt_pattern_) &&
+      !std::holds_alternative<TrivialPattern<T>>(downstream->stmt_pattern_)) {
+    PADDLE_THROW(phi::errors::PreconditionNotMet(
+        "Illegal call to GetFakeReduceIterIdx"));
+  }
+
+  // TODO(xiongkun): restore the code below after fixing the relation bug for
+  // graphs with multiple paths.
+  // const auto& split_reduce_dims_result =
+  //     SplitReduceInputDimsIfRelatedWithNonReduceAxis(
+  //         axes_info_.GetSignature(upstream->sink_op_), upstream->sink_op_);
+  // const auto& upstream_reduce_dims = split_reduce_dims_result.non_related;
+  // const auto& upstream_non_reduce_dims = split_reduce_dims_result.related;
+  //
+
+  const auto& split_reduce_input_dims_result =
+      SplitReduceInputDimsIfRelatedWithNonReduceAxis(
+          axes_info_.GetSignature(upstream->sink_op_), upstream->sink_op_);
+  VLOG(4) << split_reduce_input_dims_result.DebugStr();
+  const auto& upstream_reduce_dims = split_reduce_input_dims_result.non_related;
+  const auto& split_reduce_output_dims_result =
+      SplitReduceOutputDimsIfRelatedWithNonReduceAxis(
+          axes_info_.GetSignature(upstream->sink_op_), upstream->sink_op_);
+  VLOG(4) << split_reduce_output_dims_result.DebugStr();
+  const auto& upstream_non_reduce_dims =
+      split_reduce_output_dims_result.related;
+
+  // =======================
+
+  const auto& split_trivial_dims_result = SplitDimsWithRelationship(
+      GetAllValueDimFromValue(downstream->sink_op_->result(0)),
+      upstream_non_reduce_dims);
+
+  const auto& trivial_reorder_dims = split_trivial_dims_result.non_related;
+
+  // CHECK(upstream_reduce_dims.size() == trivial_reorder_dims.size() ||
+  //       trivial_reorder_dims.size() == 0);
+  std::unordered_set<ValueDim, ValueDimHash> visited_dims;
+  std::vector<size_t> result;
+  for (auto& reduce_dim : upstream_reduce_dims) {
+    for (auto& trivial_dim : trivial_reorder_dims) {
+      if (visited_dims.find(trivial_dim) == visited_dims.end() &&
+          trivial_dim.GetNumericValue() == reduce_dim.GetNumericValue()) {
+        visited_dims.emplace(trivial_dim);
+        result.emplace_back(trivial_dim.idx_);
+        break;
+      }
+    }
+  }
+  VLOG(4) << "FakeReduceIterIdx: " << cinn::utils::Join(result, ", ");
+  return result;
+}
+
+template class RelativeJudgePolicy<FrontendStage>;
+template class RelativeJudgePolicy<BackendStage>;
+
+}  // namespace cinn::fusion
diff --git a/paddle/cinn/operator_fusion/policy/relative_judge_policy.h b/paddle/cinn/operator_fusion/policy/relative_judge_policy.h
new file mode 100644
index 0000000000000..ac7d9037d24f5
--- /dev/null
+++ b/paddle/cinn/operator_fusion/policy/relative_judge_policy.h
@@ -0,0 +1,304 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
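A note on the CanFuse dispatch above: StmtPattern<T> is a std::variant over the concrete pattern types, so the policy picks a fusibility rule by inspecting which alternative each node currently holds. A minimal standalone sketch of that idiom (toy pattern types and stub results, not the Paddle definitions):

    #include <iostream>
    #include <variant>

    // Toy stand-ins for the real pattern alternatives; illustrative only.
    struct TrivialPattern {};
    struct ReduceTreePattern {};
    using StmtPattern = std::variant<TrivialPattern, ReduceTreePattern>;

    // Mirrors the shape of RelativeJudgePolicy<T>::CanFuse: route to a rule
    // based on the (upstream, downstream) alternative pair.
    bool CanFuse(const StmtPattern& upstream, const StmtPattern& downstream) {
      if (std::holds_alternative<ReduceTreePattern>(upstream) &&
          std::holds_alternative<TrivialPattern>(downstream)) {
        return false;  // stand-in for ReducePlusTrivialCanMerge(...)
      }
      if (std::holds_alternative<ReduceTreePattern>(upstream) &&
          std::holds_alternative<ReduceTreePattern>(downstream)) {
        return false;  // stand-in for ReduceTreeGrownCanMerge(...)
      }
      return true;  // every other pairing is fusible by default
    }

    int main() {
      std::cout << CanFuse(TrivialPattern{}, TrivialPattern{}) << "\n";     // 1
      std::cout << CanFuse(ReduceTreePattern{}, TrivialPattern{}) << "\n";  // 0
    }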
+ +#pragma once +#include +#include "paddle/cinn/operator_fusion/policy/policy_manager.h" +#include "paddle/cinn/operator_fusion/policy/shardable_axes_base.h" +#include "paddle/cinn/operator_fusion/utils.h" + +namespace cinn::fusion { + +struct ValueDim { + pir::Value v_; + size_t idx_; + ValueDim(pir::Value v, size_t idx) : v_(v), idx_(idx) {} + ValueDim() = default; + ValueDim(const ValueDim& v) = default; + bool operator==(const ValueDim& v) const { + return (idx_ == v.idx_) && (v_ == v.v_); + } + + size_t GetNumericValue() const { + return v_.type().dyn_cast().dims().at(idx_); + } + + std::string DebugStr() const { + std::ostringstream oss; + oss << "ValueDim: "; + oss << "Index: " << idx_; + oss << ", "; + v_.defining_op()->Print(oss); + return oss.str(); + } +}; + +struct ValueDimHash { + std::size_t operator()(const ValueDim& p) const { + auto h1 = std::hash{}(p.idx_); + auto h2 = std::hash{}(p.v_); + // Mainly for demonstration purposes, i.e. works but is overly simple + // In the real world, use sth. like boost.hash_combine + return h1 ^ (h2 << 1); + } +}; + +using ValueDimRelation = + std::unordered_map, + ValueDimHash>; +// ValueDimRelation[in][out] = True; means f(out) = in is related. + +static std::vector GetAllValueDimFromValue(const pir::Value& v) { + std::vector value_dims; + size_t rank = GetRank(v); + for (size_t i = 0; i < rank; ++i) { + value_dims.emplace_back(v, i); + } + return value_dims; +} + +static std::vector GetAllInputValueDim(pir::Operation* op) { + std::vector value_dims; + for (const auto& v : op->operands()) { + value_dims = ConcatVector(value_dims, GetAllValueDimFromValue(v.source())); + } + return value_dims; +} + +static std::vector GetAllOutputValueDim(pir::Operation* op) { + std::vector value_dims; + for (const auto& v : op->results()) { + value_dims = ConcatVector(value_dims, GetAllValueDimFromValue(v)); + } + return value_dims; +} + +static ValueDimRelation CreateOpRelativenessForElementWise(pir::Operation* op) { + ValueDimRelation res; + for (const auto& v : op->operands()) { + const auto& value_dims = GetAllValueDimFromValue(v.source()); + const auto& out_value_dims = GetAllOutputValueDim(op); + CHECK_EQ(value_dims.size(), out_value_dims.size()); + for (size_t i = 0; i < value_dims.size(); ++i) { + res[value_dims[i]][out_value_dims[i]] = true; + } + } + return res; +} + +static std::vector> GetNonBroadCastDims( + pir::Operation* op) { + std::vector> res; + const auto* shape_analysis = + &pir::ShapeAnalysisManager::Instance().Get(op->GetParentProgram()); + + const auto& broad_cast_value = GetBroadcastOpInputOuputValue(op); + CHECK(broad_cast_value.has_value()); + + const auto& [input_value, output_value] = broad_cast_value.value(); + const int input_rank = GetRank(input_value); + const int output_rank = GetRank(output_value); + CHECK_GE(output_rank, input_rank); + + // Compare axis one by one, from back to front. 
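+  // (Axes are aligned from the right: the last input axis is compared with the last output axis first.)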
+ // The rule of broadcasting: + // https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/beginner/tensor_cn.html#id7 + for (int i = 1; i <= input_rank; ++i) { + int input_axis = input_rank - i; + int output_axis = output_rank - i; + if (input_axis < 0 || output_axis < 0) break; + if (shape_analysis->IsProductEqual( + input_value, {input_axis}, output_value, {output_axis})) { + res.emplace_back(input_axis, output_axis); + } + } + + return res; +} + +static ValueDimRelation CreateOpRelativenessForBroadcast(pir::Operation* op) { + ValueDimRelation res; + const auto& in_value = op->operand(0).source(); + const auto& out_value = op->result(0); + for (const auto& t : GetNonBroadCastDims(op)) { + res[ValueDim(in_value, t.first)][ValueDim(out_value, t.second)] = true; + } + return res; +} + +static ValueDimRelation CreateOpRelativenessForDefault(pir::Operation* op) { + ValueDimRelation res; + for (const auto& out_dim : GetAllOutputValueDim(op)) { + for (const auto& in_dim : GetAllInputValueDim(op)) { + res[in_dim][out_dim] = true; + } + } + return res; +} + +static ValueDimRelation CreateOpRelativenessForReduce(pir::Operation* op) { + const auto& reduce_axis_idx = GetReduceAxisIdx(op); + ValueDimRelation res; + const size_t input_rank = GetRank(op->operand_source(0)); + int out_idx = 0; + bool keep_dim = GetReduceOpKeepDims(op); + for (int i = 0; i < input_rank; i++) { + if (std::find(reduce_axis_idx.begin(), reduce_axis_idx.end(), i) != + reduce_axis_idx.end()) { + res[ValueDim(op->operand_source(0), i)] + [ValueDim(op->result(0), out_idx)] = true; + out_idx += 1; + } else { + out_idx += keep_dim; + } + } + return res; +} + +static std::optional CreateOpRelativenessForSpecialOps( + pir::Operation* op) { + if (op->name() == "cinn_op.reshape") { + // Special Elementwise. + return CreateOpRelativenessForDefault(op); + } + if (op->name() == "pd_op.reshape") { + // Special Elementwise. 
+ return CreateOpRelativenessForDefault(op); + } + if (op->name() == "cinn_op.generate_shape") { + return CreateOpRelativenessForDefault(op); + } + if (op->name() == "cinn_op.yield_store") { + return CreateOpRelativenessForDefault(op); + } + return {}; +} + +static ValueDimRelation GetSingleOpRelation(pir::Operation* op) { + VLOG(4) << "GetSingleOpRelation for " << op->name(); + const auto& special_result = CreateOpRelativenessForSpecialOps(op); + if (special_result != std::nullopt) { + return special_result.value(); + } + + CHECK(op->num_results() == 1) + << "Now we do not support op with multi outputs: " << op->name(); + const hlir::framework::OpPatternKind kind = GetOpPatternKind(op); + ValueDimRelation result; + if (kind == hlir::framework::kReduction) { + result = CreateOpRelativenessForReduce(op); + } else if (kind == hlir::framework::kElementWise) { + result = CreateOpRelativenessForElementWise(op); + } else if (kind == hlir::framework::kBroadcast) { + result = CreateOpRelativenessForBroadcast(op); + } else { + result = CreateOpRelativenessForDefault(op); + } + return result; +} + +static std::vector> FlattenRelation( + const ValueDimRelation& axes_relation) { + std::vector> res; + for (const auto& in_dim_pair : axes_relation) { + for (const auto& out_dim_pair : in_dim_pair.second) { + res.emplace_back(in_dim_pair.first, out_dim_pair.first); + } + } + return res; +} + +static ValueDimRelation AnalysisIndexExprRelation( + const std::vector& ops) { + ValueDimRelation res; + + for (size_t i = ops.size(); i >= 1; --i) { + pir::Operation* op = ops[i - 1]; + if (op->name() == "cf.yield") continue; + + const auto& value_dim_relation = GetSingleOpRelation(op); + for (const auto& in_out_pair : FlattenRelation(value_dim_relation)) { + for (const auto& out_relation : res[in_out_pair.second]) { + res[in_out_pair.first][out_relation.first] = true; + } + res[in_out_pair.first][in_out_pair.second] = true; + } + } + return res; +} + +struct SplitDims { + std::vector related; + std::vector non_related; + + std::string DebugStr() const { + std::stringstream ss; + ss << "SplitDims:\nrelated:\n"; + for (const auto& dim : related) { + ss << dim.DebugStr() << "\n"; + } + ss << "non_related:\n"; + for (const auto& dim : non_related) { + ss << dim.DebugStr() << "\n"; + } + return ss.str(); + } +}; + +template +class RelativeJudgePolicy final : public Policy { + public: + RelativeJudgePolicy(const std::vector& ops, + const pir::ShapeConstraintIRAnalysis* shape_analysis) + : axes_info_(ops, shape_analysis) { + VLOG(4) << "[relative_judge_policy] Start AnalysisIndexExprRelation."; + index_expr_map_ = AnalysisIndexExprRelation(ops); + VLOG(4) << "[relative_judge_policy] End AnalysisIndexExprRelation."; + } + bool CanFuse(const PatternNodePtr& upstream, + const PatternNodePtr& downstream) override; + + std::string Name() { return "RelativeJudgePolicy"; } + + std::vector GetFakeReduceIterIdx( + const PatternNodePtr& upstream, + const PatternNodePtr& downstream) override; + + bool IsRelated(ValueDim in, ValueDim out) { + return index_expr_map_[in].count(out) == 1; + } + + private: + ValueDimRelation index_expr_map_; + ShardableAxesInfoManager axes_info_; + bool ReduceTreeGrownCanMerge(const PatternNodePtr&, + const PatternNodePtr&); + bool IsFlattenDimSmaller(const PatternNodePtr& upstream, + const PatternNodePtr& downstream); + bool ReducePlusTrivialCanMerge(const PatternNodePtr&, + const PatternNodePtr&); + SplitDims SplitDimsWithRelationship( + const std::vector& targets, + const std::vector& related_with); + 
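// Returns the first candidate reduce pattern that consumes an output of the given reduce op (see the .cc definition above). +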
std::optional> GetDownstreamFromCandidate( + const ReducePattern& upstream, + const std::vector>& candidates); + bool IsDownstreamStmtDependReduceOp(pir::Operation* reduce, + const StmtPattern& downstream); + bool IsBroadcastEdge(const std::vector& upstream_out_dims, + const std::vector&); +}; + +} // namespace cinn::fusion diff --git a/paddle/cinn/operator_fusion/policy/shardable_axes_base.cc b/paddle/cinn/operator_fusion/policy/shardable_axes_base.cc new file mode 100644 index 0000000000000..a9876ea0b8271 --- /dev/null +++ b/paddle/cinn/operator_fusion/policy/shardable_axes_base.cc @@ -0,0 +1,305 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/cinn/operator_fusion/policy/shardable_axes_base.h" + +namespace cinn::fusion { + +ShardableAxes ShardableAxesInfoManager::ReplaceShardableAxesWithRootName( + const ShardableAxes& axes) { + std::vector names; + for (auto name : axes.axis_names) { + names.push_back(name_union_[name]); + } + return ShardableAxes(names); +} + +ShardableAxesSignature ShardableAxesInfoManager::GetSignature( + pir::Operation* op) { + return op_signature_map_[op]; + // TODO(baizhou) fix broadcast signature and enable here + // auto result = ShardableAxesSignature(); + // auto origin_sig = op_signature_map_[op]; + // for (const auto& axes : origin_sig.inputs) { + // result.inputs.emplace_back(ReplaceShardableAxesWithRootName(axes)); + // } + // for (const auto& axes : origin_sig.outputs) { + // result.outputs.emplace_back(ReplaceShardableAxesWithRootName(axes)); + // } + // return result; +} + +ShardableAxes ShardableAxesInfoManager::GetAxes(pir::Value value) { + return ReplaceShardableAxesWithRootName(value_axes_map_[value]); +} + +std::string ShardableAxesInfoManager::GetUniqueName() { + static std::atomic counter = 0; + counter += 1; + return "D" + std::to_string(counter); +} + +std::vector CreateNewNamesWithRank(int64_t rank) { + auto result = std::vector(); + for (int64_t i = 0; i < rank; i++) { + result.emplace_back(ShardableAxesInfoManager::GetUniqueName()); + } + return result; +} + +ShardableAxesSignature CreateDefaultSignature(pir::Operation* op) { + ShardableAxesSignature result = ShardableAxesSignature(); + for (int i = 0; i < op->num_operands(); ++i) { + result.inputs.emplace_back( + CreateNewNamesWithRank(GetRank(op->operand_source(i)))); + } + for (int i = 0; i < op->num_results(); ++i) { + result.outputs.emplace_back(CreateNewNamesWithRank(GetRank(op->result(i)))); + } + return result; +} + +std::optional CreateSignatureForSpecialOps( + pir::Operation* op) { + if (op->isa()) { + return CreateDefaultSignature(op); + } + if (op->name() == "cinn_op.generate_shape") { + return CreateDefaultSignature(op); + } + if (op->name() == "cinn_op.yield_store") { + return CreateDefaultSignature(op); + } + if (op->name() == "cinn_op.reshape") { + return CreateDefaultSignature(op); + } + if (op->name() == "pd_op.reshape") { + return CreateDefaultSignature(op); + } + 
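// No special case matched; CreateShardableSignature falls back to the kind-based signatures below. +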
return std::nullopt; +} + +ShardableAxesSignature CreateSignatureForReduce(pir::Operation* reduce_op) { + CHECK_EQ(reduce_op->num_operands(), 1); + CHECK_EQ(reduce_op->num_results(), 1); + ShardableAxesSignature result = ShardableAxesSignature(); + const size_t input_rank = GetRank(reduce_op->operand_source(0)); + auto input_axes = CreateNewNamesWithRank(input_rank); + + const auto& reduce_axis_idx = GetReduceAxisIdx(reduce_op); + bool keep_dim = GetReduceOpKeepDims(reduce_op); + auto output_axes = std::vector(); + + for (int i = 0; i < input_rank; i++) { + if (std::find(reduce_axis_idx.begin(), reduce_axis_idx.end(), i) != + reduce_axis_idx.end()) { + if (keep_dim) { + output_axes.emplace_back(ShardableAxesInfoManager::GetUniqueName()); + } // else do nothing + } else { + output_axes.emplace_back(input_axes[i]); + } + } + + result.inputs.emplace_back(input_axes); + result.outputs.emplace_back(output_axes); + + return result; +} + +ShardableAxesSignature CreateSignatureForElementWise(pir::Operation* op) { + ShardableAxesSignature result = ShardableAxesSignature(); + + int64_t rank = GetRank(op->result(0)); + auto same_axes = CreateNewNamesWithRank(rank); + + for (int i = 0; i < op->num_operands(); ++i) { + CHECK(rank == GetRank(op->operand_source(i))); + result.inputs.emplace_back(same_axes); + } + for (int i = 0; i < op->num_results(); ++i) { + CHECK(rank == GetRank(op->result(i))); + result.outputs.emplace_back(same_axes); + } + return result; +} + +ShardableAxesSignature CreateSignatureForBroadcast( + pir::Operation* op, const pir::ShapeConstraintIRAnalysis* shape_analysis) { + ShardableAxesSignature result = ShardableAxesSignature(); + + const auto& broad_cast_value = GetBroadcastOpInputOuputValue(op); + CHECK(broad_cast_value.has_value()); + + const auto& [input_value, output_value] = broad_cast_value.value(); + const int input_rank = GetRank(input_value); + const int output_rank = GetRank(output_value); + CHECK_GE(output_rank, input_rank); + + // Create axes for operands. For expand op, the second operand is the shape of + // output. + for (int i = 0; i < op->num_operands(); ++i) { + result.inputs.emplace_back( + CreateNewNamesWithRank(GetRank(op->operand_source(i)))); + } + + // Create output axes. Compare axis one by one, from back to front. 
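+  // (An output axis that matches an input axis reuses that axis name; unmatched output axes get fresh names.)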
+ // The rule of broadcasting: + // https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/beginner/tensor_cn.html#id7 + const auto& input_axis_names = result.inputs[0].axis_names; + std::vector output_axis_names; + for (int i = 1; i <= output_rank; ++i) { + int input_axis = input_rank - i; + int output_axis = output_rank - i; + if ((input_axis >= 0) && + shape_analysis->IsProductEqual( + input_value, {input_axis}, output_value, {output_axis})) { + output_axis_names.emplace_back(input_axis_names[input_axis]); + } else { + output_axis_names.emplace_back(ShardableAxesInfoManager::GetUniqueName()); + } + } + std::reverse(output_axis_names.begin(), output_axis_names.end()); + result.outputs.emplace_back(ShardableAxes(output_axis_names)); + + return result; +} + +ShardableAxesSignature ShardableAxesInfoManager::CreateShardableSignature( + pir::Operation* op) { + auto special_result = CreateSignatureForSpecialOps(op); + if (special_result != std::nullopt) { + VLOG(4) << "[ShardableAxesInfoManager] Create Shardable Axes Signature : \n" + << op->name() << " : " << special_result.value().DebugStr(); + return special_result.value(); + } + + CHECK(op->num_results() == 1) + << "Now we do not support op with multi outputs: " << op->name(); + ShardableAxesSignature result; + const hlir::framework::OpPatternKind kind = GetOpPatternKind(op); + if (kind == hlir::framework::kReduction) { + result = CreateSignatureForReduce(op); + } else if (kind == hlir::framework::kElementWise) { + result = CreateSignatureForElementWise(op); + } else if (kind == hlir::framework::kBroadcast) { + result = CreateSignatureForBroadcast(op, shape_analysis_); + } else { + result = CreateDefaultSignature(op); + } + VLOG(4) << "[ShardableAxesInfoManager] Create Shardable Axes Signature : \n" + << op->name() << " : " << result.DebugStr(); + return result; +} + +ShardableAxesInfoManager::ShardableAxesInfoManager( + const std::vector& ops, + const pir::ShapeConstraintIRAnalysis* shape_analysis) + : ops_(ops), shape_analysis_(shape_analysis) { + for (const auto& op : ops) { + if (op->name() == "cf.yield") continue; + op_signature_map_[op] = CreateShardableSignature(op); + } + + const auto FindRoot = [&](std::string non_root) { + std::string result = non_root; + while (name_union_[result] != result) { + result = name_union_[result]; + } + return result; + }; + + const auto CombineAxes = [&](const ShardableAxes& root, + const ShardableAxes& non_root) { + CHECK_EQ(root.axis_names.size(), non_root.axis_names.size()); + for (int i = 0; i < non_root.axis_names.size(); i++) { + name_union_[non_root.axis_names[i]] = FindRoot(root.axis_names[i]); + } + }; + + for (const auto& [op, axes_signature] : op_signature_map_) { + for (int i = 0; i < op->num_operands(); ++i) { + auto value = op->operand_source(i); + auto axes = axes_signature.inputs[i]; + if (value_axes_map_.find(value) == value_axes_map_.end()) { + value_axes_map_[value] = axes; + for (auto& axis_name : axes.axis_names) { + name_union_[axis_name] = axis_name; + } + } else { + CombineAxes(value_axes_map_[value], axes); + } + } + for (int i = 0; i < op->num_results(); ++i) { + auto value = op->result(i); + auto axes = axes_signature.outputs[i]; + if (value_axes_map_.find(value) == value_axes_map_.end()) { + value_axes_map_[value] = axes; + for (auto& axis_name : axes.axis_names) { + name_union_[axis_name] = axis_name; + } + } else { + CombineAxes(value_axes_map_[value], axes); + } + } + } + + VLOG(4) << NameUnionDebugStr(); +} + +std::string ShardableAxes::DebugStr() const { + 
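// Renders the axis names as a comma-separated list, e.g. "D1, D2, D3, ". +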
std::stringstream ss; + for (const auto& name : axis_names) { + ss << name << ", "; + } + return ss.str(); +} + +std::string ShardableAxesSignature::DebugStr() const { + std::stringstream ss; + ss << "ShardableAxes Signature:\n"; + for (int i = 0; i < inputs.size(); i++) { + ss << "input " << i << ": " << inputs[i].DebugStr() << "\n"; + } + for (int i = 0; i < outputs.size(); i++) { + ss << "output " << i << ": " << outputs[i].DebugStr() << "\n"; + } + return ss.str(); +} + +std::string ShardableAxesInfoManager::NameUnionDebugStr() const { + std::stringstream ss; + ss << "[ShardableAxesInfoManager] NameUnion :\n"; + + std::unordered_map> root_to_sons; + for (const auto& [non_root, root] : name_union_) { + if (root_to_sons.find(root) == root_to_sons.end()) { + root_to_sons[root] = std::vector{non_root}; + } else { + root_to_sons[root].push_back(non_root); + } + } + for (const auto& [root, sons] : root_to_sons) { + ss << "Root " << root << ": "; + for (const auto& son : sons) { + ss << son << ", "; + } + ss << "\n"; + } + + return ss.str(); +} + +} // namespace cinn::fusion diff --git a/paddle/cinn/frontend/group_cluster/cluster_policy/shardable_axes_policy/shardable_axes_base.h b/paddle/cinn/operator_fusion/policy/shardable_axes_base.h similarity index 65% rename from paddle/cinn/frontend/group_cluster/cluster_policy/shardable_axes_policy/shardable_axes_base.h rename to paddle/cinn/operator_fusion/policy/shardable_axes_base.h index c9c341c0b05de..1202641bab3c4 100644 --- a/paddle/cinn/frontend/group_cluster/cluster_policy/shardable_axes_policy/shardable_axes_base.h +++ b/paddle/cinn/operator_fusion/policy/shardable_axes_base.h @@ -14,39 +14,42 @@ #pragma once -#include "paddle/cinn/frontend/group_cluster/common_utils.h" +#include "paddle/cinn/operator_fusion/utils.h" -namespace cinn::frontend::group_cluster::policy { +namespace cinn::fusion { struct ShardableAxes { + ShardableAxes() : axis_names({}) {} explicit ShardableAxes(const std::vector& names) : axis_names(names) {} std::vector axis_names; - std::string DebugStr(); + std::string DebugStr() const; }; struct ShardableAxesSignature { std::vector inputs; std::vector outputs; - std::string DebugStr(); + std::string DebugStr() const; }; struct ShardableAxesInfoManager { ShardableAxesInfoManager( - const std::vector& ops, + const std::vector& ops, const pir::ShapeConstraintIRAnalysis* shape_analysis); - ShardableAxesSignature GetSignature(const pir::Operation* op); - ShardableAxes GetAxes(const pir::Value value); + ShardableAxesSignature GetSignature(pir::Operation* op); + ShardableAxes GetAxes(pir::Value value); + ShardableAxesSignature CreateShardableSignature(pir::Operation* op); + ShardableAxes ReplaceShardableAxesWithRootName(const ShardableAxes& axes); static std::string GetUniqueName(); + std::string NameUnionDebugStr() const; private: - const std::vector& ops_; + const std::vector& ops_; const pir::ShapeConstraintIRAnalysis* shape_analysis_; - std::unordered_map - op_signature_map_; + std::unordered_map op_signature_map_; std::unordered_map value_axes_map_; std::unordered_map name_union_; }; -} // namespace cinn::frontend::group_cluster::policy +} // namespace cinn::fusion diff --git a/paddle/cinn/operator_fusion/policy/shardable_axes_policy.cc b/paddle/cinn/operator_fusion/policy/shardable_axes_policy.cc new file mode 100644 index 0000000000000..24ffa6d862c86 --- /dev/null +++ b/paddle/cinn/operator_fusion/policy/shardable_axes_policy.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/operator_fusion/policy/shardable_axes_policy.h"
+#include "paddle/cinn/operator_fusion/backend/pattern.h"
+#include "paddle/cinn/operator_fusion/frontend/pattern.h"
+
+namespace cinn::fusion {
+
+template <typename T>
+bool ShardableAxesRRFusePolicy<T>::IsDownstreamStmtDependReduceOp(
+    pir::Operation* reduce, const StmtPattern<T>& downstream) {
+  const auto& values = GetPatternInputValues(downstream);
+  for (const auto& value : reduce->results()) {
+    if (std::find(values.begin(), values.end(), value) != values.end()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+template <typename T>
+std::optional<ReducePattern<T>>
+ShardableAxesRRFusePolicy<T>::GetDownstreamFromCandidate(
+    const ReducePattern<T>& upstream,
+    const std::vector<ReducePattern<T>>& candidates) {
+  pir::Operation* reduce = upstream.GetReduceOp();
+  for (const auto& candidate : candidates) {
+    if (IsDownstreamStmtDependReduceOp(reduce, candidate)) {
+      return candidate;
+    }
+  }
+  return {};
+}
+
+static std::set<std::string> GetReduceAxesName(
+    const ShardableAxesSignature& signature) {
+  const auto& input_names = signature.inputs[0].axis_names;
+  const auto& output_names = signature.outputs[0].axis_names;
+  std::set<std::string> res(input_names.begin(), input_names.end());
+  for (const auto& n : output_names) {
+    res.erase(n);
+  }
+  return res;
+}
+
+template <typename T>
+bool ShardableAxesRRFusePolicy<T>::ReduceTreeGrownCanMerge(
+    const PatternNodePtr<T>& upstream, const PatternNodePtr<T>& downstream) {
+  if (!upstream->IsReduceTree() || !downstream->IsReduceTree()) {
+    return false;
+  }
+  const auto& upstream_tree =
+      std::get<ReduceTreePattern<T>>(upstream->stmt_pattern_);
+  const auto& downstream_tree =
+      std::get<ReduceTreePattern<T>>(downstream->stmt_pattern_);
+  const auto& maybe_downstream_op = GetDownstreamFromCandidate(
+      upstream_tree.GetRootPattern(), downstream_tree.reduce_patterns_);
+  if (!maybe_downstream_op.has_value()) {
+    return false;
+  }
+  const pir::Value& reduce_out_value =
+      upstream_tree.GetRootPattern().GetReduceOp()->result(0);
+  pir::Operation* downstream_reduce_op =
+      maybe_downstream_op.value().GetReduceOp();
+  const auto& reduce_names =
+      GetReduceAxesName(axes_info_.GetSignature(downstream_reduce_op));
+  for (const auto& n :
+       axes_info_.GetAxes(downstream_reduce_op->result(0)).axis_names) {
+    if (reduce_names.count(n) > 0) {
+      // not meeting the BroadcastEdge condition.
+      return false;
+    }
+  }
+  return true;
+}
+
+template <typename T>
+bool ShardableAxesRRFusePolicy<T>::CanFuse(
+    const PatternNodePtr<T>& upstream, const PatternNodePtr<T>& downstream) {
+  // TODO(wuzhanfei) shardable axes policy
+  return ReduceTreeGrownCanMerge(upstream, downstream);
+}
+
+template class ShardableAxesRRFusePolicy<FrontendStage>;
+template class ShardableAxesRRFusePolicy<BackendStage>;
+
+}  // namespace cinn::fusion
diff --git a/paddle/cinn/operator_fusion/policy/shardable_axes_policy.h b/paddle/cinn/operator_fusion/policy/shardable_axes_policy.h
new file mode 100644
index 0000000000000..d4c662c6c3a09
--- /dev/null
+++ b/paddle/cinn/operator_fusion/policy/shardable_axes_policy.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2024 PaddlePaddle Authors.
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/cinn/operator_fusion/policy/policy_manager.h" +#include "paddle/cinn/operator_fusion/policy/shardable_axes_base.h" + +namespace cinn::fusion { + +template +class ShardableAxesRRFusePolicy final : public Policy { + public: + ShardableAxesRRFusePolicy( + const std::vector& ops, // NOLINT + const pir::ShapeConstraintIRAnalysis* shape_analysis) // NOLINT + : axes_info_(ops, shape_analysis) {} + bool CanFuse(const PatternNodePtr& upstream, + const PatternNodePtr& downstream) override; + std::string Name() { return "ShardableAxesRRFusePolicy"; } + + private: + bool ReduceTreeGrownCanMerge(const PatternNodePtr&, + const PatternNodePtr&); + std::optional> GetDownstreamFromCandidate( + const ReducePattern& upstream, + const std::vector>& candidates); + ShardableAxesInfoManager axes_info_; + bool IsDownstreamStmtDependReduceOp(pir::Operation* reduce, + const StmtPattern& downstream); +}; + +} // namespace cinn::fusion diff --git a/paddle/cinn/operator_fusion/utils.h b/paddle/cinn/operator_fusion/utils.h new file mode 100644 index 0000000000000..696836fe2a780 --- /dev/null +++ b/paddle/cinn/operator_fusion/utils.h @@ -0,0 +1,178 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
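The utilities header that follows is deliberately header-only (static functions and templates), since it is included from both the frontend and backend fusion stages. One contract worth calling out: ExtendVector and UniqueConcatVector deduplicate while preserving first-occurrence order, whereas UniqueVectorBySet round-trips through std::set and therefore also sorts. A small self-contained sketch of that difference (simplified local copies of the templates below, not the Paddle build):

    #include <iostream>
    #include <set>
    #include <unordered_set>
    #include <vector>

    // Simplified copy of cinn::fusion::ExtendVector: append elements of
    // `second` not yet present in `first`, keeping first-occurrence order.
    template <typename T>
    void ExtendVector(std::vector<T>* first, const std::vector<T>& second) {
      std::unordered_set<T> visited(first->begin(), first->end());
      for (const T& v : second) {
        if (visited.insert(v).second) first->push_back(v);
      }
    }

    // Simplified copy of cinn::fusion::UniqueVectorBySet: dedups but sorts.
    template <typename T>
    std::vector<T> UniqueVectorBySet(const std::vector<T>& v) {
      std::set<T> unique(v.begin(), v.end());
      return std::vector<T>(unique.begin(), unique.end());
    }

    int main() {
      std::vector<int> a{3, 1, 2};
      ExtendVector(&a, {2, 4, 1, 5});
      for (int v : a) std::cout << v << ' ';  // 3 1 2 4 5 (order preserved)
      std::cout << '\n';
      for (int v : UniqueVectorBySet(std::vector<int>{3, 1, 2, 2, 4})) {
        std::cout << v << ' ';  // 1 2 3 4 (sorted)
      }
      std::cout << '\n';
    }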
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "glog/logging.h" +#include "paddle/cinn/hlir/dialect/operator/ir/cinn_op.h" +#include "paddle/cinn/hlir/dialect/operator/ir/manual_op.h" +#include "paddle/cinn/hlir/framework/op.h" +#include "paddle/cinn/utils/string.h" +#include "paddle/pir/include/dialect/control_flow/ir/cf_op.h" + +namespace cinn::fusion { + +using OpPatternKind = cinn::hlir::framework::OpPatternKind; +static OpPatternKind GetOpPatternKind(const ::pir::Operation* op) { + return hlir::framework::pir::CompatibleInfo::OpKind(*op); +} + +static size_t GetRank(pir::Value value) { + return value.type().dyn_cast().dims().size(); +} + +static std::vector GetReduceAxisIdx(pir::Operation* reduce_op) { + const size_t input_rank = GetRank(reduce_op->operand_source(0)); + const auto& attr_val = reduce_op->attributes().at("dim"); + CHECK(attr_val.isa<::pir::ArrayAttribute>()); + const auto& axis_attr = attr_val.dyn_cast<::pir::ArrayAttribute>(); + std::vector reduce_axis_idx; + for (int i = 0; i < axis_attr.size(); ++i) { + int64_t axis = axis_attr.at(i).dyn_cast<::pir::Int64Attribute>().data(); + if (axis < 0) { + axis += input_rank; + } + CHECK_GE(axis, 0); + CHECK_LT(axis, input_rank); + reduce_axis_idx.push_back(axis); + } + VLOG(4) << "GetReduceAxisIdx: " << utils::Join(reduce_axis_idx, ","); + return reduce_axis_idx; +} + +static bool GetReduceOpKeepDims(pir::Operation* reduce_op) { + const auto& attr_val = reduce_op->attributes().at("keep_dim"); + CHECK(attr_val.isa<::pir::BoolAttribute>()); + return attr_val.dyn_cast<::pir::BoolAttribute>().data(); +} + +static std::string OpsDebugStr(std::vector ops) { + std::stringstream ss; + pir::IrPrinter printer(ss); + for (const auto* op : ops) { + printer.PrintOperation(const_cast(op)); + ss << "\n"; + } + return ss.str(); +} + +static std::optional> +GetBroadcastOpInputOuputValue(pir::Operation* op) { + auto* mut_op = const_cast(op); + if (op->isa()) { + auto expand_op = mut_op->dyn_cast(); + return std::make_pair(expand_op.x(), expand_op.out()); + } else if (op->isa()) { + auto broadcast_op = mut_op->dyn_cast(); + return std::make_pair(broadcast_op.x(), broadcast_op.out()); + } else { + CHECK(false) << "Unsupported broadcast op: " << op->name(); + } + return std::nullopt; +} + +template +void RemoveFromVector(std::vector* vec, T item) { + auto iter = std::find(vec->begin(), vec->end(), item); + if (iter != vec->end()) { + vec->erase(iter); + } +} + +template +std::vector ConcatVector(const std::vector& first, + const std::vector& second) { + std::vector result = first; + result.insert(result.end(), second.begin(), second.end()); + return result; +} + +template +std::vector FilterVector(const std::vector& first, const F& func) { + std::vector result; + for (const auto& i : first) { + if (func(i)) { + result.push_back(i); + } + } + return result; +} + +template +std::vector MapVector(const std::vector& as, + const std::function& func) { + std::vector res; + for (const auto& a : as) { + res.push_back(func(a)); + } + return res; +} + +template +std::set ToSet(const std::vector& input) { + std::set result(input.begin(), input.end()); + return result; +} + +template +bool IsAnyFirstInSecond(const std::vector& first, + const std::vector& second) { + const auto& second_set = ToSet(second); + for (const auto& ele : first) { + if (second_set.count(ele)) { + return true; + } + } + return false; +} + +template +std::vector UniqueVectorBySet(const 
std::vector& v) { + std::set unique(v.begin(), v.end()); + return std::vector(unique.begin(), unique.end()); +} + +template +void ExtendVector(std::vector* first, const std::vector& second) { + std::unordered_set visited = + std::unordered_set(first->begin(), first->end()); + for (auto iter = second.begin(); iter != second.end(); iter++) { + if (visited.find(*iter) == visited.end()) { + visited.emplace(*iter); + first->emplace_back(*iter); + } + } +} + +template +std::vector UniqueConcatVector(const std::vector& first, + const std::vector& second) { + std::vector result = std::vector(first); + ExtendVector(&result, second); + return result; +} + +} // namespace cinn::fusion diff --git a/paddle/cinn/optim/cast_bool_to_int8.cc b/paddle/cinn/optim/cast_bool_to_int8.cc index 64385623bcd21..55c8053fc6db5 100644 --- a/paddle/cinn/optim/cast_bool_to_int8.cc +++ b/paddle/cinn/optim/cast_bool_to_int8.cc @@ -38,10 +38,30 @@ struct Mutator : public ir::IRMutator<> { } // namespace +void CastBoolExprToInt8Impl(common::UnknownArch, Expr* e) { + LOG(FATAL) << "unknown architecture."; +} + +void CastBoolExprToInt8Impl(common::X86Arch, Expr* e) { + Mutator mutator; + mutator.Visit(e, e); +} + +void CastBoolExprToInt8Impl(common::ARMArch, Expr* e) { + // Do nothing. +} + +void CastBoolExprToInt8Impl(common::NVGPUArch, Expr* e) { + // Do nothing. +} + +void CastBoolExprToInt8(common::Arch arch, Expr* e) { + return std::visit( + [&](const auto& impl) { return CastBoolExprToInt8Impl(impl, e); }, + arch.variant()); +} + void CastBoolToInt8(Expr* e, Target target) { - if (target.arch == Target::Arch::X86) { - Mutator mutator; - mutator.Visit(e, e); - } + CastBoolExprToInt8(target.arch, e); } } // namespace cinn::optim diff --git a/paddle/cinn/optim/lower_intrin.cc b/paddle/cinn/optim/lower_intrin.cc index 07fe5370e7761..5c0fa6566d60c 100644 --- a/paddle/cinn/optim/lower_intrin.cc +++ b/paddle/cinn/optim/lower_intrin.cc @@ -25,12 +25,14 @@ namespace cinn { namespace optim { -void LowerIntrin(Expr *e, Target target) { - if (target.arch == Target::Arch::X86) { - codegen::RegisterCpuIntrinRule(); - } else { - return; - } +template +void LowerIntrinImpl(const T &, const Target &target, Expr *e) { + // Do nothing. +} + +void LowerIntrinImpl(common::X86Arch, const Target &target, Expr *e) { + codegen::RegisterCpuIntrinRule(); + struct Mutator : ir::IRMutator { Target target; @@ -99,5 +101,15 @@ void LowerIntrin(Expr *e, Target target) { m(e); } +void LowerIntrinByArch(Expr *e, const Target &target) { + return std::visit( + [&](const auto &impl) { return LowerIntrinImpl(impl, target, e); }, + target.arch.variant()); +} + +void LowerIntrin(Expr *e, Target target) { + return LowerIntrinByArch(e, target); +} + } // namespace optim } // namespace cinn diff --git a/paddle/cinn/optim/map_extern_call.cc b/paddle/cinn/optim/map_extern_call.cc index d260cea233dd4..1b9bbf1e57374 100644 --- a/paddle/cinn/optim/map_extern_call.cc +++ b/paddle/cinn/optim/map_extern_call.cc @@ -44,6 +44,65 @@ static const std::set kExternInt32CallsGPU{{"left_shift", static const std::set kExternFp32CallsCPU = { "erf", "acos", "acosh", "asin", "asinh", "atan", "atanh", "remainder"}; +void DealWithCpuIntrinsics(ir::Call *node, Expr *expr) { + if (kExternFp32CallsCPU.count(node->name)) { + PADDLE_ENFORCE_GE( + node->read_args.size(), + 1UL, + phi::errors::InvalidArgument( + "The size of node's read args is incorrect." 
+ "Expected size is greater than or equal to 1, but receive %d.", + node->read_args.size())); + CHECK(node->read_args.front().type().is_float()) + << "CPU extern call intrinsics only support float now! Please " + "check."; + if (node->read_args.front().type().is_float(32)) { + auto out_type = node->type(); + *expr = lang::CallExtern(node->name + "f", node->read_args); + } + } +} + +void DealWithIntrinsicsImpl(common::UnknownArch, ir::Call *node, Expr *expr) { + DealWithCpuIntrinsics(node, expr); +} + +void DealWithIntrinsicsImpl(common::X86Arch, ir::Call *node, Expr *expr) { + DealWithCpuIntrinsics(node, expr); +} + +void DealWithIntrinsicsImpl(common::ARMArch, ir::Call *node, Expr *expr) { + DealWithCpuIntrinsics(node, expr); +} + +void DealWithIntrinsicsImpl(common::NVGPUArch, ir::Call *node, Expr *expr) { + auto arg_size = node->read_args.size(); + if (arg_size == 0UL) { + // some node like __syncthreads hasn't arguments + return; + } + const auto &dtype = node->read_args.front().type(); + const auto &name = node->name; + + bool node_in_extern_fp32 = kExternFp32CallsGPU.count(name); + bool node_in_extern_int32 = kExternInt32CallsGPU.count(name); + if (!node_in_extern_fp32 && !node_in_extern_int32) { + return; + } + + std::string extern_func = + hlir::GetExternFuncName(cinn::common::DefaultNVGPUTarget(), dtype, name); + *expr = lang::CallExtern(extern_func, node->read_args, node->attrs); +} + +void DealWithIntrinsics(common::Arch arch, ir::Call *node, Expr *expr) { + return std::visit( + [&](const auto &impl) { + return DealWithIntrinsicsImpl(impl, node, expr); + }, + arch.variant()); +} + void MapExternCall(Expr *e, Target target) { struct Mutator : ir::IRMutator { Target target; @@ -56,50 +115,7 @@ void MapExternCall(Expr *e, Target target) { auto *node = expr->As(); CHECK(node); OptimizeConstantPow(node); - if (target.arch == Target::Arch::NVGPU) { - DealWithNvGpuIntrinsics(node, expr); - } else { - DealWithCpuIntrinsics(node, expr); - } - } - - void DealWithCpuIntrinsics(ir::Call *node, Expr *expr) { - if (kExternFp32CallsCPU.count(node->name)) { - PADDLE_ENFORCE_GE( - node->read_args.size(), - 1UL, - phi::errors::InvalidArgument( - "The size of node's read args is incorrect." - "Expected size is greater than or equal to 1, but receive %d.", - node->read_args.size())); - CHECK(node->read_args.front().type().is_float()) - << "CPU extern call intrinsics only support float now! 
Please " - "check."; - if (node->read_args.front().type().is_float(32)) { - auto out_type = node->type(); - *expr = lang::CallExtern(node->name + "f", node->read_args); - } - } - } - - void DealWithNvGpuIntrinsics(ir::Call *node, Expr *expr) { - auto arg_size = node->read_args.size(); - if (arg_size == 0UL) { - // some node like __syncthreads hasn't arguments - return; - } - const auto &dtype = node->read_args.front().type(); - const auto &name = node->name; - - bool node_in_extern_fp32 = kExternFp32CallsGPU.count(name); - bool node_in_extern_int32 = kExternInt32CallsGPU.count(name); - if (!node_in_extern_fp32 && !node_in_extern_int32) { - return; - } - - std::string extern_func = hlir::GetExternFuncName( - cinn::common::DefaultNVGPUTarget(), dtype, name); - *expr = lang::CallExtern(extern_func, node->read_args, node->attrs); + DealWithIntrinsics(target.arch, node, expr); } // Replace pow(x, 0.5) to sqrt(x) and pow(x, -0.5) to rsqrt(x), which diff --git a/paddle/cinn/optim/optimize.cc b/paddle/cinn/optim/optimize.cc index bd6690838c09e..3e1ac6a2030b5 100644 --- a/paddle/cinn/optim/optimize.cc +++ b/paddle/cinn/optim/optimize.cc @@ -66,7 +66,7 @@ Expr Optimize(Expr e, RemoveGpuForloopsAxis(&copied); } CudaSyncThreadsDropIfThenElse(&copied); - // TransBufferWithDynamicShape(&copied); + // CudaTransBufferWithDynamicShape(&copied); #endif SimplifyBlocks(&copied); diff --git a/paddle/cinn/optim/trans_buffer_with_dynamic_shape.cc b/paddle/cinn/optim/trans_buffer_with_dynamic_shape.cc index a0b5ec89b494c..c46efa09cc64a 100644 --- a/paddle/cinn/optim/trans_buffer_with_dynamic_shape.cc +++ b/paddle/cinn/optim/trans_buffer_with_dynamic_shape.cc @@ -103,12 +103,11 @@ struct Mutator : public ir::IRMutator<> { } // namespace -void TransBufferWithDynamicShape(ir::Expr* e) { +void CudaTransBufferWithDynamicShape(ir::Expr* e) { Mutator mutator; mutator.Visit(e, e); #ifdef CINN_WITH_CUDA - auto cur_dev_info = - common::DevInfoMgr::GetDevInfo(0); + auto cur_dev_info = common::DevInfoMgr::GetDevInfo(0); if (cur_dev_info->IsValid()) { size_t max_shm_per_block = cur_dev_info->GetMaxSharedMemPerBlock(); CHECK(mutator.shared_mem_size_used_ <= max_shm_per_block) diff --git a/paddle/cinn/optim/trans_buffer_with_dynamic_shape.h b/paddle/cinn/optim/trans_buffer_with_dynamic_shape.h index 4913347c0971c..c546770a0941f 100644 --- a/paddle/cinn/optim/trans_buffer_with_dynamic_shape.h +++ b/paddle/cinn/optim/trans_buffer_with_dynamic_shape.h @@ -24,7 +24,7 @@ namespace optim { * Given Expr AST, translate dynamic shape in buffers to * static shape, the pass is just used on Nvidia GPU temporarily. 
*/ -void TransBufferWithDynamicShape(ir::Expr* expr); +void CudaTransBufferWithDynamicShape(ir::Expr* expr); } // namespace optim } // namespace cinn diff --git a/paddle/cinn/optim/transform_polyfor_to_for_test.cc b/paddle/cinn/optim/transform_polyfor_to_for_test.cc index b6f7c073df154..652365d11722c 100644 --- a/paddle/cinn/optim/transform_polyfor_to_for_test.cc +++ b/paddle/cinn/optim/transform_polyfor_to_for_test.cc @@ -49,7 +49,7 @@ TEST(Expr, basic) { auto func = Lower("matmul", stages, {A, B, C}); Target target; - target.arch = Target::Arch ::X86; + target.arch = common::X86Arch{}; target.bits = Target::Bit ::k32; target.os = Target::OS ::Linux; diff --git a/paddle/cinn/optim/vectorize_loops_test.cc b/paddle/cinn/optim/vectorize_loops_test.cc index 7f9abe1e2c512..4e4ac9e24763c 100644 --- a/paddle/cinn/optim/vectorize_loops_test.cc +++ b/paddle/cinn/optim/vectorize_loops_test.cc @@ -55,7 +55,7 @@ TEST(Vectorize, replace_var) { Expr func = optim::Optimize(funcs, cinn::common::DefaultHostTarget()); Target target; - target.arch = Target::Arch ::X86; + target.arch = common::X86Arch{}; target.bits = Target::Bit ::k32; target.os = Target::OS ::Linux; @@ -99,7 +99,7 @@ TEST(Vectorize, TestMarkVectorize) { Expr N(500); Target target; - target.arch = Target::Arch ::X86; + target.arch = common::X86Arch{}; target.bits = Target::Bit ::k32; target.os = Target::OS ::Linux; diff --git a/paddle/cinn/pybind/CMakeLists.txt b/paddle/cinn/pybind/CMakeLists.txt index ec409578930df..970203a273389 100755 --- a/paddle/cinn/pybind/CMakeLists.txt +++ b/paddle/cinn/pybind/CMakeLists.txt @@ -15,6 +15,8 @@ set(srcs utils.cc schedule.cc) +gather_srcs(cinnapi_src SRCS ${srcs}) + if(WITH_CUDA) message(STATUS "Compile core_api with CUDA support") cinn_nv_library( diff --git a/paddle/cinn/pybind/bind.cc b/paddle/cinn/pybind/bind.cc index 4c20f22b973cf..6882a1ac87208 100644 --- a/paddle/cinn/pybind/bind.cc +++ b/paddle/cinn/pybind/bind.cc @@ -21,27 +21,28 @@ namespace py = pybind11; namespace cinn::pybind { -PYBIND11_MODULE(core_api, m) { - m.doc() = "CINN core API"; - - py::module runtime = m.def_submodule("runtime", "bind cinn_runtime"); - py::module common = m.def_submodule("common", "namespace cinn::common"); - py::module lang = m.def_submodule("lang", "namespace cinn::lang"); - py::module ir = m.def_submodule("ir", "namespace cinn::ir"); - py::module poly = m.def_submodule("poly", "namespace cinn::poly, polyhedral"); - py::module backends = m.def_submodule( +void BindCINN(py::module *m) { + py::module cinn = + m->def_submodule("cinn", "Compiler Infrastructure for Neural Networks"); + py::module runtime = cinn.def_submodule("runtime", "bind cinn_runtime"); + py::module common = cinn.def_submodule("common", "namespace cinn::common"); + py::module lang = cinn.def_submodule("lang", "namespace cinn::lang"); + py::module ir = cinn.def_submodule("ir", "namespace cinn::ir"); + py::module poly = + cinn.def_submodule("poly", "namespace cinn::poly, polyhedral"); + py::module backends = cinn.def_submodule( "backends", "namespace cinn::backends, execution backends"); - py::module optim = - m.def_submodule("optim", "namespace cinn::optim, CINN IR optimization"); - py::module pe = m.def_submodule( + py::module optim = cinn.def_submodule( + "optim", "namespace cinn::optim, CINN IR optimization"); + py::module pe = cinn.def_submodule( "pe", "namespace cinn::hlir::pe, CINN Primitive Emitters"); py::module frontend = - m.def_submodule("frontend", "namespace cinn::frontend, CINN frontend"); - py::module framework = m.def_submodule( + 
cinn.def_submodule("frontend", "namespace cinn::frontend, CINN frontend"); + py::module framework = cinn.def_submodule( "framework", "namespace cinn::hlir::framework, CINN framework"); py::module utils = - m.def_submodule("utils", "namespace cinn::utils, CINN framework"); - py::module schedule = m.def_submodule( + cinn.def_submodule("utils", "namespace cinn::utils, CINN framework"); + py::module schedule = cinn.def_submodule( "schedule", "namespace cinn::ir::schedule, CINN Schedule"); BindRuntime(&runtime); diff --git a/paddle/cinn/pybind/bind.h b/paddle/cinn/pybind/bind.h index 77566097a19aa..bd9f69ece3c7f 100644 --- a/paddle/cinn/pybind/bind.h +++ b/paddle/cinn/pybind/bind.h @@ -53,4 +53,8 @@ void BindFrontend(pybind11::module *m); void BindFramework(pybind11::module *m); void BindUtils(pybind11::module *m); void BindSchedule(pybind11::module *m); + +__attribute__((visibility("default"))) extern void BindCINN( + pybind11::module *m); + } // namespace cinn::pybind diff --git a/paddle/cinn/pybind/common.cc b/paddle/cinn/pybind/common.cc index 7d777af91204a..9f7bd3bdf0d91 100644 --- a/paddle/cinn/pybind/common.cc +++ b/paddle/cinn/pybind/common.cc @@ -27,11 +27,16 @@ namespace py = pybind11; namespace cinn::pybind { +using cinn::common::Arch; +using cinn::common::ARMArch; using cinn::common::bfloat16; using cinn::common::CINNValue; using cinn::common::float16; +using cinn::common::NVGPUArch; using cinn::common::Target; using cinn::common::Type; +using cinn::common::UnknownArch; +using cinn::common::X86Arch; using utils::GetStreamCnt; using utils::StringFormat; @@ -44,14 +49,26 @@ void BindCinnValue(py::module *); void ResetGlobalNameID() { cinn::common::Context::Global().ResetNameId(); } void BindTarget(py::module *m) { + py::class_(*m, "Arch") + .def("IsX86Arch", + [](const common::Arch &arch) { + return std::holds_alternative(arch); + }) + .def("IsNVGPUArch", [](const common::Arch &arch) { + return std::holds_alternative(arch); + }); + py::class_ target(*m, "Target"); target.def_readwrite("os", &Target::os) .def_readwrite("arch", &Target::arch) + .def_static("X86Arch", []() -> common::Arch { return common::X86Arch{}; }) + .def_static("NVGPUArch", + []() -> common::Arch { return common::NVGPUArch{}; }) .def_readwrite("bits", &Target::bits) .def_readwrite("features", &Target::features) .def(py::init<>()) .def(py::init &>()) .def("defined", &Target::defined) @@ -71,12 +88,6 @@ void BindTarget(py::module *m) { .value("Linux", Target::OS::Linux) .value("Windows", Target::OS::Windows); - py::enum_ arch(target, "Arch"); - arch.value("Unk", Target::Arch::Unk) - .value("X86", Target::Arch::X86) - .value("ARM", Target::Arch::ARM) - .value("NVGPU", Target::Arch::NVGPU); - py::enum_ bit(target, "Bit"); bit.value("Unk", Target::Bit::Unk) .value("k32", Target::Bit::k32) diff --git a/paddle/cinn/pybind/framework.cc b/paddle/cinn/pybind/framework.cc index 5122a61d9fc7b..36c9683e22d1c 100644 --- a/paddle/cinn/pybind/framework.cc +++ b/paddle/cinn/pybind/framework.cc @@ -78,7 +78,12 @@ void BindFramework(pybind11::module *m) { input_output_names, key, target); - CHECK_EQ(funcs.size(), 1U); + PADDLE_ENFORCE_EQ(funcs.size(), + 1U, + phi::errors::InvalidArgument( + "The size of funcs is incorrect." 
+ "Expected size is 1, but receive %d.", + funcs.size())); func = funcs[0]; return func; }); @@ -103,8 +108,11 @@ void BindFramework(pybind11::module *m) { }) .def("get_attr", [](NodeAttr &self, const std::string &key) { - CHECK_EQ(self.attr_store.count(key), 1) - << "Didn't find value with key [" << key << "]."; + PADDLE_ENFORCE_EQ(self.attr_store.count(key), + 1, + phi::errors::InvalidArgument( + "Didn't find value with key [%d].", + self.attr_store.count(key))); return self.attr_store[key]; }) .def("__str__", [](NodeAttr &self) { return utils::GetStreamCnt(self); }); @@ -119,24 +127,27 @@ void BindFramework(pybind11::module *m) { t->shape().data().end()); py::array array(std::move(dt), std::move(shape)); auto *mutable_data = array.mutable_data(); - if (target.arch == Target::Arch::X86) { - std::memcpy(mutable_data, - t->data(), - t->shape().numel() * t->type().bytes()); - } else if (target.arch == Target::Arch::NVGPU) { + target.arch.Visit(adt::match{ + [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::X86Arch) { + std::memcpy(mutable_data, + t->data(), + t->shape().numel() * t->type().bytes()); + }, + [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::NVGPUArch) { #ifdef CINN_WITH_CUDA - CUDA_CALL(cudaMemcpy( - mutable_data, - reinterpret_cast(t->mutable_data(target, t->type())), - t->shape().numel() * t->type().bytes(), - cudaMemcpyDeviceToHost)); + CUDA_CALL(cudaMemcpy(mutable_data, + reinterpret_cast( + t->mutable_data(target, t->type())), + t->shape().numel() * t->type().bytes(), + cudaMemcpyDeviceToHost)); #else PADDLE_THROW(phi::errors::Fatal("To use CUDA backends, " "you need to set WITH_CUDA ON!")); #endif - } else { - CINN_NOT_IMPLEMENTED - } + }, + }); return array; }) .def("var_names", &Scope::var_names); @@ -152,38 +163,41 @@ void BindFramework(pybind11::module *m) { [](hlir::framework::Tensor &self, Type type) { self->set_type(type); }) - .def( - "numpy", - [](hlir::framework::Tensor &self, - const cinn::common::Target &target) { - std::string type_str = cinn::common::Type2Str(self->type()); - if (type_str == "bfloat16") { - type_str = "uint16"; - } - py::dtype dt(type_str); - py::array::ShapeContainer shape(self->shape().data().begin(), - self->shape().data().end()); - py::array array(std::move(dt), std::move(shape)); - void *array_data = array.mutable_data(); - if (target.arch == Target::Arch::X86) { - std::memcpy(array_data, - self->data(), - self->shape().numel() * self->type().bytes()); - } else if (target.arch == Target::Arch::NVGPU) { + .def("numpy", + [](hlir::framework::Tensor &self, + const cinn::common::Target &target) { + std::string type_str = cinn::common::Type2Str(self->type()); + if (type_str == "bfloat16") { + type_str = "uint16"; + } + py::dtype dt(type_str); + py::array::ShapeContainer shape(self->shape().data().begin(), + self->shape().data().end()); + py::array array(std::move(dt), std::move(shape)); + void *array_data = array.mutable_data(); + target.arch.Visit(adt::match{ + [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::X86Arch) { + std::memcpy(array_data, + self->data(), + self->shape().numel() * self->type().bytes()); + }, + [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::NVGPUArch) { #ifdef CINN_WITH_CUDA - CUDA_CALL(cudaMemcpy(array_data, - self->data(), - self->shape().numel() * self->type().bytes(), - cudaMemcpyDeviceToHost)); + CUDA_CALL( + cudaMemcpy(array_data, + self->data(), + self->shape().numel() * self->type().bytes(), + cudaMemcpyDeviceToHost)); #else 
PADDLE_THROW(phi::errors::Fatal("To use CUDA backends, " "you need to set WITH_CUDA ON!")); #endif - } else { - CINN_NOT_IMPLEMENTED - } - return array; - }) + }, + }); + return array; + }) .def( "from_numpy", [](hlir::framework::Tensor &self, @@ -194,30 +208,44 @@ void BindFramework(pybind11::module *m) { << "currently only support float32 data type as input"; hlir::framework::shape_t shape; std::copy_n(array.shape(), array.ndim(), std::back_inserter(shape)); - CHECK_EQ( + PADDLE_ENFORCE_EQ( std::accumulate(shape.begin(), shape.end(), 1, [](int32_t a, int32_t b) { return a * b; }), - self->shape().numel()); + self->shape().numel(), + phi::errors::InvalidArgument( + "The product of all elements in the shape container and " + "shape numel is not equal," + "where the product of all elements in the shape " + "container:%d but shape numel:%d.", + std::accumulate(shape.begin(), + shape.end(), + 1, + [](int32_t a, int32_t b) { return a * b; }), + self->shape().numel())); auto *data = self->mutable_data(target, self->type()); - if (target.arch == Target::Arch::X86) { - std::memcpy(data, - array.data(), - self->shape().numel() * self->type().bytes()); - } else if (target.arch == Target::Arch::NVGPU) { + target.arch.Visit(adt::match{ + [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::X86Arch) { + std::memcpy(data, + array.data(), + self->shape().numel() * self->type().bytes()); + }, + [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::NVGPUArch) { #ifdef CINN_WITH_CUDA - CUDA_CALL(cudaMemcpy(reinterpret_cast(data), - reinterpret_cast(array.data()), - self->shape().numel() * self->type().bytes(), - cudaMemcpyHostToDevice)); + CUDA_CALL( + cudaMemcpy(reinterpret_cast(data), + reinterpret_cast(array.data()), + self->shape().numel() * self->type().bytes(), + cudaMemcpyHostToDevice)); #else PADDLE_THROW(phi::errors::Fatal("To use CUDA backends, " "you need to set WITH_CUDA ON!")); #endif - } else { - CINN_NOT_IMPLEMENTED - } + }, + }); }); py::class_ instruction(*m, "Instruction"); diff --git a/paddle/cinn/pybind/frontend.cc b/paddle/cinn/pybind/frontend.cc index f7eaf01a59f07..fec7c5efb8b0a 100644 --- a/paddle/cinn/pybind/frontend.cc +++ b/paddle/cinn/pybind/frontend.cc @@ -219,27 +219,34 @@ void BindFrontend(pybind11::module *m) { auto in_tensor = scope->GetTensor(tensor_inputs[i]->id); auto dtype = tensor_inputs[i]->type; auto *data = in_tensor->mutable_data(target, dtype); - CHECK_EQ(input_data[i].size(), in_tensor->shape().numel()) - << "The size of tensor [" << tensor_inputs[i]->id - << "] is different with the input data's size! Please check."; - if (target.arch == Target::Arch::NVGPU) { + PADDLE_ENFORCE_EQ(input_data[i].size(), + in_tensor->shape().numel(), + phi::errors::InvalidArgument( + "The size of tensor [%d] is different with " + "the input data's size! 
Please check.", + tensor_inputs[i]->id)); + target.arch.Visit(adt::match{ + [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::X86Arch) { + memcpy(data, + input_data[i].data(), + in_tensor->shape().numel() * + dtype.bytes()); // All random data + }, + [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::NVGPUArch) { #ifdef CINN_WITH_CUDA - CUDA_CALL(cudaMemcpy(data, - input_data[i].data(), - in_tensor->shape().numel() * dtype.bytes(), - cudaMemcpyHostToDevice)); + CUDA_CALL( + cudaMemcpy(data, + input_data[i].data(), + in_tensor->shape().numel() * dtype.bytes(), + cudaMemcpyHostToDevice)); #else PADDLE_THROW(phi::errors::Fatal("To use CUDA backends, " "you need to set WITH_CUDA ON!")); #endif - } else if (target.arch == Target::Arch::X86) { - memcpy(data, - input_data[i].data(), - in_tensor->shape().numel() * - dtype.bytes()); // All random data - } else { - CINN_NOT_IMPLEMENTED - } + }, + }); } program->Execute(); @@ -294,104 +301,118 @@ void BindFrontend(pybind11::module *m) { * '/python/tests/test_op_benchmark.py' * */ - .def( - "test_benchmark", - [](Program &self, - const cinn::common::Target &target, - const std::vector &tensor_inputs, - const std::vector &input_data, - const Variable &tensor_out, - int repeat_, - const std::string &info) { - std::shared_ptr g( - new hlir::framework::Graph(self, target)); - hlir::framework::ApplyPass(g.get(), "InferShape"); - std::shared_ptr scope = - hlir::framework::BuildScope(target, g); - hlir::framework::CompilationContext context(g, scope, target); - hlir::framework::GraphCompiler gc(context); - auto program = gc.Build(); - for (size_t i = 0; i < tensor_inputs.size(); i++) { - auto in_tensor = scope->GetTensor(tensor_inputs[i]->id); - auto *data = in_tensor->mutable_data(target); - CHECK_EQ(input_data[i].size(), in_tensor->shape().numel()) - << "The size of tensor [" << tensor_inputs[i]->id - << "] is different with the input data's size! Please check."; - if (target.arch == Target::Arch::NVGPU) { + .def("test_benchmark", + [](Program &self, + const cinn::common::Target &target, + const std::vector &tensor_inputs, + const std::vector &input_data, + const Variable &tensor_out, + int repeat_, + const std::string &info) { + std::shared_ptr g( + new hlir::framework::Graph(self, target)); + hlir::framework::ApplyPass(g.get(), "InferShape"); + std::shared_ptr scope = + hlir::framework::BuildScope(target, g); + hlir::framework::CompilationContext context(g, scope, target); + hlir::framework::GraphCompiler gc(context); + auto program = gc.Build(); + for (size_t i = 0; i < tensor_inputs.size(); i++) { + auto in_tensor = scope->GetTensor(tensor_inputs[i]->id); + auto *data = in_tensor->mutable_data(target); + PADDLE_ENFORCE_EQ( + input_data[i].size(), + in_tensor->shape().numel(), + phi::errors::InvalidArgument( + "The size of tensor [%d] is different with " + "the input data's size! 
Please check.", + tensor_inputs[i]->id)); + target.arch.Visit(adt::match{ + [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::X86Arch) { + for (size_t j = 0; j < in_tensor->shape().numel(); j++) { + data[j] = reinterpret_cast( + input_data[i].data())[j]; // All random data + } + }, + [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::NVGPUArch) { #ifdef CINN_WITH_CUDA - CUDA_CALL(cudaMemcpy(reinterpret_cast(data), - input_data[i].data(), - in_tensor->shape().numel() * sizeof(float), - cudaMemcpyHostToDevice)); + CUDA_CALL( + cudaMemcpy(reinterpret_cast(data), + input_data[i].data(), + in_tensor->shape().numel() * sizeof(float), + cudaMemcpyHostToDevice)); #else PADDLE_THROW(phi::errors::Fatal("To use CUDA backends, " "you need to set WITH_CUDA ON!")); #endif - } else if (target.arch == Target::Arch::X86) { - for (size_t j = 0; j < in_tensor->shape().numel(); j++) { - data[j] = reinterpret_cast( - input_data[i].data())[j]; // All random data - } - } else { - CINN_NOT_IMPLEMENTED - } - } - VLOG(3) << info; - program->ExecuteTest(repeat_); - auto out = scope->GetTensor(tensor_out->id); - return out; - }) - .def( - "test_benchmark_with_code", - [](Program &self, - const cinn::common::Target &target, - const std::vector &tensor_inputs, - const std::vector &input_data, - const Variable &tensor_out, - int repeat_, - const std::string &info, - const std::string &code) { - // std::shared_ptr g(new - // hlir::framework::Graph(self, target)); - // hlir::framework::ApplyPass(g.get(), "InferShape"); - std::unordered_set fetch_ids; - auto graph = cinn::frontend::Optimize(&self, fetch_ids, target); - std::shared_ptr scope = - hlir::framework::BuildScope(target, graph); + }, + }); + } + VLOG(3) << info; + program->ExecuteTest(repeat_); + auto out = scope->GetTensor(tensor_out->id); + return out; + }) + .def("test_benchmark_with_code", + [](Program &self, + const cinn::common::Target &target, + const std::vector &tensor_inputs, + const std::vector &input_data, + const Variable &tensor_out, + int repeat_, + const std::string &info, + const std::string &code) { + // std::shared_ptr g(new + // hlir::framework::Graph(self, target)); + // hlir::framework::ApplyPass(g.get(), "InferShape"); + std::unordered_set fetch_ids; + auto graph = cinn::frontend::Optimize(&self, fetch_ids, target); + std::shared_ptr scope = + hlir::framework::BuildScope(target, graph); - hlir::framework::CompilationContext context(graph, scope, target); - hlir::framework::GraphCompiler gc(context); - auto program = gc.Build(code); - for (size_t i = 0; i < tensor_inputs.size(); i++) { - auto in_tensor = scope->GetTensor(tensor_inputs[i]->id); - auto *data = in_tensor->mutable_data(target); - CHECK_EQ(input_data[i].size(), in_tensor->shape().numel()) - << "The size of tensor [" << tensor_inputs[i]->id - << "] is different with the input data's size! Please check."; - if (target.arch == Target::Arch::NVGPU) { + hlir::framework::CompilationContext context(graph, scope, target); + hlir::framework::GraphCompiler gc(context); + auto program = gc.Build(code); + for (size_t i = 0; i < tensor_inputs.size(); i++) { + auto in_tensor = scope->GetTensor(tensor_inputs[i]->id); + auto *data = in_tensor->mutable_data(target); + PADDLE_ENFORCE_EQ( + input_data[i].size(), + in_tensor->shape().numel(), + phi::errors::InvalidArgument( + "The size of tensor [%d] is different with " + "the input data's size! 
Please check.", + tensor_inputs[i]->id)); + target.arch.Visit(adt::match{ + [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::X86Arch) { + for (size_t j = 0; j < in_tensor->shape().numel(); j++) { + data[j] = reinterpret_cast( + input_data[i].data())[j]; // All random data + } + }, + [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; }, + [&](common::NVGPUArch) { #ifdef CINN_WITH_CUDA - CUDA_CALL(cudaMemcpy(reinterpret_cast(data), - input_data[i].data(), - in_tensor->shape().numel() * sizeof(float), - cudaMemcpyHostToDevice)); + CUDA_CALL( + cudaMemcpy(reinterpret_cast(data), + input_data[i].data(), + in_tensor->shape().numel() * sizeof(float), + cudaMemcpyHostToDevice)); #else PADDLE_THROW(phi::errors::Fatal("To use CUDA backends, " "you need to set WITH_CUDA ON!")); #endif - } else if (target.arch == Target::Arch::X86) { - for (size_t j = 0; j < in_tensor->shape().numel(); j++) { - data[j] = reinterpret_cast( - input_data[i].data())[j]; // All random data - } - } else { - CINN_NOT_IMPLEMENTED - } - } - VLOG(3) << info; - program->ExecuteTest(repeat_); - auto out = scope->GetTensor(tensor_out->id); - return out; - }); + }, + }); + } + VLOG(3) << info; + program->ExecuteTest(repeat_); + auto out = scope->GetTensor(tensor_out->id); + return out; + }); py::class_(*m, "Interpreter") .def(py::init &, @@ -928,7 +949,7 @@ void BindFrontend(pybind11::module *m) { .def("get_cinn_name", [](PaddleModelConvertor &self, const std::string &paddle_name) { CHECK(self.var_model_to_program_map().count(paddle_name)) - << "Cannot find variabel " << paddle_name + << "Cannot find variable " << paddle_name << " in CINN! Please check."; return self.var_model_to_program_map().at(paddle_name); }); diff --git a/paddle/cinn/pybind/ir/ir.cc b/paddle/cinn/pybind/ir/ir.cc index d9f9bd5fcdf7f..42e8e157998cd 100644 --- a/paddle/cinn/pybind/ir/ir.cc +++ b/paddle/cinn/pybind/ir/ir.cc @@ -33,7 +33,14 @@ void TensorStore(Expr tensor, Expr value, const std::vector& indices) { std::vector AxisMap(const std::string& kinds, const std::vector& iter_expression) { std::vector rets; - CHECK_EQ(kinds.size(), iter_expression.size()); + PADDLE_ENFORCE_EQ( + kinds.size(), + iter_expression.size(), + phi::errors::InvalidArgument( + "The size of kinds and iter expression in AxisMap is not equal," + "where kinds size:%d but iter expression size:%d.", + kinds.size(), + iter_expression.size())); int n = iter_expression.size(); rets.reserve(n); for (int i = 0; i < n; i++) { diff --git a/paddle/cinn/pybind/lang.cc b/paddle/cinn/pybind/lang.cc index 5f7a80e12e2c0..ed321a66ddc18 100644 --- a/paddle/cinn/pybind/lang.cc +++ b/paddle/cinn/pybind/lang.cc @@ -153,13 +153,22 @@ void BindModule(py::module *m) { builder.def(py::init()) .def("add_function", [](ir::Module::Builder &self, ir::LoweredFunc func) { - if (self.GetTargetArch() == Target::Arch::NVGPU) { + self.GetTargetArch().Visit(adt::match{ + [&](common::UnknownArch) { LOG(FATAL) << "NotImplemented"; }, + [&](common::X86Arch) { + // Do nothing + }, + [&](common::ARMArch) { + // Do nothing + }, + [&](common::NVGPUArch) { #ifdef CINN_WITH_CUDA - auto func_expr = Expr(func); - ir::SetCudaAxisInfo(&func_expr); - optim::OptimizeExprGPU(&(func->body)); + auto func_expr = Expr(func); + ir::SetCudaAxisInfo(&func_expr); + optim::OptimizeExprGPU(&(func->body)); #endif - } + }, + }); self.AddFunction(func); }) .def("add_buffer", &ir::Module::Builder::AddBuffer) diff --git a/paddle/cinn/pybind/runtime.cc b/paddle/cinn/pybind/runtime.cc index 0ef1ee542aa35..0d38616147536 100644 --- 
a/paddle/cinn/pybind/runtime.cc +++ b/paddle/cinn/pybind/runtime.cc @@ -74,30 +74,47 @@ cinn_buffer_t *CreateBufferFromNumpy(py::array data, return buffer; } +cinn_buffer_t *CreateBufferFromNumpyImpl(common::UnknownArch, py::array data) { + LOG(FATAL) << "NotImplemented."; +} + +cinn_buffer_t *CreateBufferFromNumpyImpl(common::X86Arch, py::array data) { + return CreateBufferFromNumpy(data, cinn_x86_device); +} + +cinn_buffer_t *CreateBufferFromNumpyImpl(common::ARMArch, py::array data) { + LOG(FATAL) << "NotImplemented."; +} + +cinn_buffer_t *CreateBufferFromNumpyImpl(common::NVGPUArch, py::array data) { +#ifdef CINN_WITH_CUDA + std::vector shape; + std::copy_n(data.shape(), data.ndim(), std::back_inserter(shape)); + auto *buffer = new cinn_buffer_t(); + buffer->device = cinn_nvgpu_device; + buffer->memory_size = data.nbytes(); + CUDA_CALL(cudaMalloc(&buffer->memory, data.nbytes())); + CUDA_CALL(cudaMemcpy( + buffer->memory, data.data(), data.nbytes(), cudaMemcpyHostToDevice)); + return buffer; +#else + PADDLE_THROW(phi::errors::Fatal( + "To use CUDA backends, you need to set WITH_CUDA ON!")); +#endif +} + +cinn_buffer_t *InterfaceCreateBufferFromNumpy(common::Arch arch, + py::array data) { + return std::visit( + [&](const auto &impl) { return CreateBufferFromNumpyImpl(impl, data); }, + arch.variant()); +} + cinn_buffer_t *CreateBufferFromNumpy( py::array data, cinn::common::Target target = cinn::common::DefaultHostTarget(), int align = 0) { - if (target == cinn::common::DefaultHostTarget()) { - return CreateBufferFromNumpy(data, cinn_x86_device); - } else if (target.arch == Target::Arch::NVGPU) { -#ifdef CINN_WITH_CUDA - std::vector shape; - std::copy_n(data.shape(), data.ndim(), std::back_inserter(shape)); - auto *buffer = new cinn_buffer_t(); - buffer->device = cinn_nvgpu_device; - buffer->memory_size = data.nbytes(); - CUDA_CALL(cudaMalloc(&buffer->memory, data.nbytes())); - CUDA_CALL(cudaMemcpy( - buffer->memory, data.data(), data.nbytes(), cudaMemcpyHostToDevice)); - return buffer; -#else - PADDLE_THROW(phi::errors::Fatal( - "To use CUDA backends, you need to set WITH_CUDA ON!")); -#endif - } else { - CINN_NOT_IMPLEMENTED - } + return InterfaceCreateBufferFromNumpy(target.arch, data); } void BufferCopyTo(const cinn_buffer_t &buffer, py::array array) { diff --git a/paddle/cinn/runtime/cpu/CMakeLists.txt b/paddle/cinn/runtime/cpu/CMakeLists.txt index 72fa5f51bb0ca..804ee29ca5377 100644 --- a/paddle/cinn/runtime/cpu/CMakeLists.txt +++ b/paddle/cinn/runtime/cpu/CMakeLists.txt @@ -4,8 +4,8 @@ gather_srcs(cinnapi_src SRCS host_intrinsics.cc thread_backend.cc) if(WITH_MKL_CBLAS) gather_srcs(cinnapi_src SRCS mkl_math.cc cblas.cc) - if(WITH_MKLDNN) - gather_srcs(cinnapi_src SRCS mkldnn_math.cc) + if(WITH_ONEDNN) + gather_srcs(cinnapi_src SRCS onednn_math.cc) endif() endif() @@ -15,8 +15,8 @@ if(WITH_MKL_CBLAS) cinn_cc_test(test_mkl_math SRCS mkl_math_test.cc mkl_math.cc DEPS cinncore) endif() - if(WITH_MKLDNN) - cinn_cc_test(test_mkldnn_math SRCS mkldnn_math_test.cc mkldnn_math.cc DEPS + if(WITH_ONEDNN) + cinn_cc_test(test_onednn_math SRCS onednn_math_test.cc onednn_math.cc DEPS cinncore) endif() endif() diff --git a/paddle/cinn/runtime/cpu/mkl_math_test.cc b/paddle/cinn/runtime/cpu/mkl_math_test.cc index 50798ebb39029..f9149dab3a615 100644 --- a/paddle/cinn/runtime/cpu/mkl_math_test.cc +++ b/paddle/cinn/runtime/cpu/mkl_math_test.cc @@ -78,7 +78,7 @@ void TestCallElementwise(const std::string &fn_name, auto stages = CreateStages(lower_args); auto target = cinn::common::DefaultHostTarget(); - 
target.arch = Target::Arch::X86; + target.arch = cinn::common::X86Arch{}; ir::Module::Builder builder("module0", target); auto func = Lower("fn", stages, lower_args); builder.AddFunction(func); @@ -216,7 +216,7 @@ TEST(cinn_cpu_mkl_gemm_fp32, test) { auto stages = CreateStages({call, out}); auto target = cinn::common::DefaultHostTarget(); - target.arch = Target::Arch::X86; + target.arch = cinn::common::X86Arch{}; ir::Module::Builder builder("module0", target); auto func = Lower("fn", stages, {A, B, out, call}); diff --git a/paddle/cinn/runtime/cpu/mkldnn_math.cc b/paddle/cinn/runtime/cpu/onednn_math.cc similarity index 96% rename from paddle/cinn/runtime/cpu/mkldnn_math.cc rename to paddle/cinn/runtime/cpu/onednn_math.cc index f20e56e32f1e6..66af7029d7e58 100644 --- a/paddle/cinn/runtime/cpu/mkldnn_math.cc +++ b/paddle/cinn/runtime/cpu/onednn_math.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/cinn/runtime/cpu/mkldnn_math.h" +#include "paddle/cinn/runtime/cpu/onednn_math.h" #include @@ -25,7 +25,7 @@ using dnnl::memory; using tag = memory::format_tag; using dt = memory::data_type; -void cinn_cpu_mkldnn_softmax_fp32(int batch, +void cinn_cpu_onednn_softmax_fp32(int batch, int channel, int h, int w, @@ -75,7 +75,7 @@ void cinn_cpu_mkldnn_softmax_fp32(int batch, engine_stream.wait(); } -void cinn_cpu_mkldnn_conv2d_nchw_fp32(int batch_size, +void cinn_cpu_onednn_conv2d_nchw_fp32(int batch_size, int c_in, int input_h, int input_w, @@ -157,7 +157,7 @@ void cinn_cpu_mkldnn_conv2d_nchw_fp32(int batch_size, cpu_stream.wait(); } -CINN_REGISTER_HELPER(cinn_cpu_mkldnn) { +CINN_REGISTER_HELPER(cinn_cpu_onednn) { using namespace cinn; // NOLINT using backends::FunctionProto; auto host_target = cinn::common::DefaultHostTarget(); @@ -195,7 +195,7 @@ CINN_REGISTER_HELPER(cinn_cpu_mkldnn) { return shape; }; - REGISTER_EXTERN_FUNC_HELPER(cinn_cpu_mkldnn_conv2d_nchw_fp32, host_target) + REGISTER_EXTERN_FUNC_HELPER(cinn_cpu_onednn_conv2d_nchw_fp32, host_target) .SetRetType() .AddInputType() // batch_size .AddInputType() // c_in @@ -217,7 +217,7 @@ CINN_REGISTER_HELPER(cinn_cpu_mkldnn) { .SetShapeInference(inference_shape_conv2d_nchw) .End(); - REGISTER_EXTERN_FUNC_HELPER(cinn_cpu_mkldnn_softmax_fp32, host_target) + REGISTER_EXTERN_FUNC_HELPER(cinn_cpu_onednn_softmax_fp32, host_target) .SetRetType() .AddInputType() // batch_size .AddInputType() // c_in diff --git a/paddle/cinn/runtime/cpu/mkldnn_math.h b/paddle/cinn/runtime/cpu/onednn_math.h similarity index 95% rename from paddle/cinn/runtime/cpu/mkldnn_math.h rename to paddle/cinn/runtime/cpu/onednn_math.h index 9a37d13d57865..7d95b6461b0a9 100644 --- a/paddle/cinn/runtime/cpu/mkldnn_math.h +++ b/paddle/cinn/runtime/cpu/onednn_math.h @@ -21,7 +21,7 @@ // define some C APIs extern "C" { -void cinn_cpu_mkldnn_softmax_fp32(int batch, +void cinn_cpu_onednn_softmax_fp32(int batch, int channel, int h, int w, @@ -29,7 +29,7 @@ void cinn_cpu_mkldnn_softmax_fp32(int batch, cinn_buffer_t* inputs, cinn_buffer_t* out); -void cinn_cpu_mkldnn_conv2d_nchw_fp32(int batch_size, +void cinn_cpu_onednn_conv2d_nchw_fp32(int batch_size, int c_in, int input_h, int input_w, diff --git a/paddle/cinn/runtime/cpu/mkldnn_math_test.cc b/paddle/cinn/runtime/cpu/onednn_math_test.cc similarity index 96% rename from paddle/cinn/runtime/cpu/mkldnn_math_test.cc rename to paddle/cinn/runtime/cpu/onednn_math_test.cc index 15574a9028042..cbfa19ffb4762 100644 --- 
a/paddle/cinn/runtime/cpu/mkldnn_math_test.cc +++ b/paddle/cinn/runtime/cpu/onednn_math_test.cc @@ -42,7 +42,7 @@ cinn_buffer_t *CreateBuffer(const std::vector shape, return cinn::common::BufferBuilder(Float(32), shape).set_zero().Build(); } -TEST(cinn_cpu_mkldnn_conv2d_nchw_fp32, test) { +TEST(cinn_cpu_onednn_conv2d_nchw_fp32, test) { int n(1); int c_in(3); int i_h(224); @@ -65,7 +65,7 @@ TEST(cinn_cpu_mkldnn_conv2d_nchw_fp32, test) { auto call = Compute( {Expr(1)}, [=]() -> Expr { - return lang::CallExtern("cinn_cpu_mkldnn_conv2d_nchw_fp32", + return lang::CallExtern("cinn_cpu_onednn_conv2d_nchw_fp32", { Expr(n), // batch_size Expr(c_in), // c_in @@ -85,7 +85,7 @@ TEST(cinn_cpu_mkldnn_conv2d_nchw_fp32, test) { weights.tensor() // weights }); }, - "cinn_cpu_mkldnn_conv2d_nchw_fp32"); + "cinn_cpu_onednn_conv2d_nchw_fp32"); auto out = call->TupleGet(0); out->WithBuffer(Float(32)); @@ -93,7 +93,7 @@ TEST(cinn_cpu_mkldnn_conv2d_nchw_fp32, test) { auto stages = CreateStages({call, out}); auto target = cinn::common::DefaultHostTarget(); - target.arch = Target::Arch::X86; + target.arch = cinn::common::X86Arch{}; ir::Module::Builder builder("module0", target); auto func = Lower("fn", stages, {input, weights, out, call}); diff --git a/paddle/cinn/runtime/cpu/use_extern_funcs.h b/paddle/cinn/runtime/cpu/use_extern_funcs.h index e708864f5b36a..4c65e5ff30501 100644 --- a/paddle/cinn/runtime/cpu/use_extern_funcs.h +++ b/paddle/cinn/runtime/cpu/use_extern_funcs.h @@ -21,7 +21,7 @@ CINN_USE_REGISTER(host_intrinsics) CINN_USE_REGISTER(mkl_math) CINN_USE_REGISTER(cinn_cpu_mkl) #ifdef CINN_WITH_DNNL -CINN_USE_REGISTER(cinn_cpu_mkldnn) +CINN_USE_REGISTER(cinn_cpu_onednn) #endif #endif CINN_USE_REGISTER(cinn_backend_parallel) diff --git a/paddle/cinn/runtime/flags.cc b/paddle/cinn/runtime/flags.cc index c310a47f5f180..9427d0eda7195 100644 --- a/paddle/cinn/runtime/flags.cc +++ b/paddle/cinn/runtime/flags.cc @@ -75,7 +75,7 @@ PD_DEFINE_bool(group_schedule_tiling_first, "Whether to enable new group scheduler tiling first strategy."); PD_DEFINE_bool(cinn_new_cluster_op_method, - BoolFromEnv("FLAGS_cinn_new_cluster_op_method", false), + BoolFromEnv("FLAGS_cinn_new_cluster_op_method", true), "Whether to enable newly developed clustering method of group " "op for cinn."); @@ -343,17 +343,38 @@ bool IsCompiledWithCUDNN() { #endif } +void CheckCompileOptionImpl(cinn::common::UnknownArch) { + PADDLE_THROW(phi::errors::Fatal("unknown architecture")); +} + +void CheckCompileOptionImpl(cinn::common::X86Arch) { + // Do nothing. +} + +void CheckCompileOptionImpl(cinn::common::ARMArch) { + // Do nothing. 
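`CreateBufferFromNumpyImpl` above and `CheckCompileOptionImpl` here show a second flavor of the same dispatch: one free-function overload per architecture tag plus a generic `std::visit` forwarder over `arch.variant()`, which keeps each per-arch body a named function rather than an inline lambda. A hedged sketch with invented names and a reduced tag set:

```cpp
#include <stdexcept>
#include <variant>

struct X86Arch {};
struct NVGPUArch {};
using Arch = std::variant<X86Arch, NVGPUArch>;

// One overload per tag type; the tag parameter exists only for dispatch.
void CheckBuildFlagsImpl(X86Arch) { /* host builds always pass */ }
void CheckBuildFlagsImpl(NVGPUArch) {
#ifndef EXAMPLE_WITH_CUDA  // stand-in for the CINN_WITH_CUDNN / WITH_CUDA guards
  throw std::runtime_error("recompile with CUDA support to target NVGPU");
#endif
}

// A generic lambda forwards whichever alternative is active to overload
// resolution, mirroring the std::visit(..., arch.variant()) calls above.
void CheckBuildFlags(const Arch& arch) {
  std::visit([](const auto& impl) { CheckBuildFlagsImpl(impl); }, arch);
}
```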
+} + +void CheckCompileOptionImpl(cinn::common::NVGPUArch) { +#if defined(CINN_WITH_CUDNN) + // Do nothing; +#else + PADDLE_THROW(phi::errors::Fatal( + "Current CINN version does not support NVGPU, please try to " + "recompile with -DWITH_CUDA.")); +#endif +} + +void CheckCompileOption(cinn::common::Arch arch) { + return std::visit([](const auto& impl) { CheckCompileOptionImpl(impl); }, + arch.variant()); +} + cinn::common::Target CurrentTarget::target_ = cinn::common::DefaultTarget(); void CurrentTarget::SetCurrentTarget(const cinn::common::Target& target) { - if (!IsCompiledWithCUDA() && - target.arch == cinn::common::Target::Arch::NVGPU) { - PADDLE_THROW(phi::errors::Fatal( - "Current CINN version does not support NVGPU, please try to " - "recompile with -DWITH_CUDA.")); - } else { - target_ = target; - } + CheckCompileOption(target.arch); + target_ = target; } cinn::common::Target& CurrentTarget::GetCurrentTarget() { return target_; } diff --git a/paddle/common/enforce.cc b/paddle/common/enforce.cc index 0719035db4c49..6dd4f0372e2b3 100644 --- a/paddle/common/enforce.cc +++ b/paddle/common/enforce.cc @@ -64,10 +64,11 @@ int GetCallStackLevel() { return FLAGS_call_stack_level; } std::string SimplifyErrorTypeFormat(const std::string& str) { std::ostringstream sout; size_t type_end_pos = str.find(':', 0); - if (str.substr(type_end_pos - 5, type_end_pos) == "Error:") { + if (type_end_pos != str.npos && type_end_pos >= 5 && + str.substr(type_end_pos - 5, 6) == "Error:") { // Remove "Error:", add "()" // Examples: - // InvalidArgumentError: xxx -> (InvalidArgument): xxx + // InvalidArgumentError: xxx -> (InvalidArgument) xxx sout << "(" << str.substr(0, type_end_pos - 5) << ")" << str.substr(type_end_pos + 1); } else { diff --git a/paddle/common/enforce.h b/paddle/common/enforce.h index 6076e9089df83..b3027d55c8065 100644 --- a/paddle/common/enforce.h +++ b/paddle/common/enforce.h @@ -362,47 +362,5 @@ inline bool is_error(const T& stat) { } namespace pir { -class IrNotMetException : public std::exception { - public: - explicit IrNotMetException(const std::string& str) - : err_str_(str + ::common::enforce::GetCurrentTraceBackString()) {} - - const char* what() const noexcept override { return err_str_.c_str(); } - - private: - std::string err_str_; - ::common::enforce::details::PaddleFatalGuard paddle_fatal_guard_; -}; - -#define IR_THROW(...) \ - do { \ - try { \ - throw pir::IrNotMetException( \ - paddle::string::Sprintf("Error occurred at: %s:%d :\n%s", \ - __FILE__, \ - __LINE__, \ - paddle::string::Sprintf(__VA_ARGS__))); \ - } catch (const std::exception& e) { \ - std::cout << e.what() << std::endl; \ - throw; \ - } \ - } while (0) - -#define IR_ENFORCE(COND, ...) \ - do { \ - bool __cond__(COND); \ - if (UNLIKELY(is_error(__cond__))) { \ - try { \ - throw pir::IrNotMetException( \ - paddle::string::Sprintf("Error occurred at: %s:%d :\n%s", \ - __FILE__, \ - __LINE__, \ - paddle::string::Sprintf(__VA_ARGS__))); \ - } catch (const std::exception& e) { \ - std::cout << e.what() << std::endl; \ - throw; \ - } \ - } \ - } while (0) - +#define IR_THROW(...) 
PADDLE_THROW(phi::errors::Fatal(__VA_ARGS__)) } // namespace pir
diff --git a/paddle/common/flags.cc b/paddle/common/flags.cc index 16057b5ef598f..770b51e6fd3f1 100644 --- a/paddle/common/flags.cc +++ b/paddle/common/flags.cc
@@ -629,6 +629,10 @@ PHI_DEFINE_EXPORTED_uint64( "The real chunk size is max(request_size, " "FLAGS_auto_growth_chunk_size_in_mb)."); +PHI_DEFINE_EXPORTED_bool(custom_device_mem_record, + false, + "Enable mem record event on custom device"); + #endif /**
@@ -737,13 +741,13 @@ PHI_DEFINE_EXPORTED_bool(set_to_1d, false, "set 0D Tensor to 1D numpy"); /** * Debug related FLAG - * Name: tracer_mkldnn_ops_on + * Name: tracer_onednn_ops_on * Since Version: 2.0.0 * Value Range: string, default=empty * Example: * Note: Holds list of operation types with OneDNN kernels to be enabled. */ -PHI_DEFINE_EXPORTED_string(tracer_mkldnn_ops_on, +PHI_DEFINE_EXPORTED_string(tracer_onednn_ops_on, "", "List of OneDNN operation types to be turned on");
@@ -761,13 +765,13 @@ PHI_DEFINE_EXPORTED_string(static_runtime_data_save_path, /** * Debug related FLAG - * Name: tracer_mkldnn_ops_off + * Name: tracer_onednn_ops_off * Since Version: 2.0.0 * Value Range: string, default=empty * Example: * Note: Holds list of operation types with OneDNN kernels to be disabled. */ -PHI_DEFINE_EXPORTED_string(tracer_mkldnn_ops_off, +PHI_DEFINE_EXPORTED_string(tracer_onednn_ops_off, "", "List of OneDNN operation types to be turned off");
@@ -1021,6 +1025,19 @@ PHI_DEFINE_EXPORTED_string(deny_cinn_ops, "", "It controls the cinn op subset to be not used."); +/* + * CINN related FLAG + * Name: FLAGS_enable_cinn_compile_cache + * Since Version: 3.0 Beta + * Value Range: bool, default=true + * Example: FLAGS_enable_cinn_compile_cache=true would reuse cached Kernel + * function + */ +PHI_DEFINE_EXPORTED_bool( + enable_cinn_compile_cache, + true, + "It controls whether to enable cinn compilation cache."); + /* * CINN related FLAG * Name: FLAGS_enable_pe_launch_cinn
@@ -1244,6 +1261,17 @@ PHI_DEFINE_EXPORTED_bool(benchmark_nccl, */ PHI_DEFINE_EXPORTED_bool(use_autotune, false, "Whether enable autotune."); +/** + * CINN training related FLAG + * Name: FLAGS_disable_dyshape_in_train + * Since Version: 2.7.0 + * Value Range: bool, default=false + * Example: + */ +PHI_DEFINE_EXPORTED_bool(disable_dyshape_in_train, + false, + "Whether to disable dyshape in training."); + /** * Conv Search cache max number related FLAG * Name: FLAGS_search_cache_max_number
@@ -1345,6 +1373,19 @@ PHI_DEFINE_EXPORTED_bool(use_shm_cache, false, "Use shm cache in mmap_allocator."); +/** + * mmap_allocator related FLAG + * Name: dataloader_use_file_descriptor + * Since Version: 2.6.2 + * Value Range: bool, default=false + * Example: + * Note: If True, mmap_allocator will use file descriptors to open shared memory + * operations.
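One fix a few hunks back, in paddle/common/enforce.cc, is worth spelling out: `std::string::substr` takes a (position, count) pair, not a (begin, end) pair, so the old `str.substr(type_end_pos - 5, type_end_pos)` compared a slice of the wrong length, and `type_end_pos - 5` underflows whenever the first ':' sits before index 5. A small demonstration of the corrected check (illustrative rewrite, not the Paddle source):

```cpp
#include <iostream>
#include <string>

// Rewrites "SomethingError: msg" as "(Something) msg"; otherwise returns str.
std::string SimplifyErrorType(const std::string& str) {
  size_t type_end_pos = str.find(':');
  // substr's second argument is a COUNT: "Error:" is 6 characters long.
  // The npos and >= 5 guards prevent the unsigned underflow the old code had.
  if (type_end_pos != std::string::npos && type_end_pos >= 5 &&
      str.substr(type_end_pos - 5, 6) == "Error:") {
    return "(" + str.substr(0, type_end_pos - 5) + ")" +
           str.substr(type_end_pos + 1);
  }
  return str;
}

int main() {
  std::cout << SimplifyErrorType("InvalidArgumentError: bad shape") << "\n";
  // prints: (InvalidArgument) bad shape
  std::cout << SimplifyErrorType("ok: no error type") << "\n";  // unchanged
}
```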
+ */ +PHI_DEFINE_EXPORTED_bool(dataloader_use_file_descriptor, + false, + "Use file descriptor in mmap_allocator."); + /** * Tensor operants related FLAG * Name: tensor_operants_mode @@ -1367,7 +1408,7 @@ PHI_DEFINE_EXPORTED_string(tensor_operants_mode, * Since Version: 2.6.0 * Value Range: bool, default=false * Example: - * Note: If Ture, executor will use new IR + * Note: If True, executor will use new IR */ PHI_DEFINE_EXPORTED_bool(enable_pir_in_executor, false, @@ -1380,7 +1421,7 @@ PHI_DEFINE_EXPORTED_bool(enable_pir_in_executor, * Since Version: 2.6.0 * Value Range: bool, default=true * Example: - * Note: If Ture, program will be translated to pir program + * Note: If True, program will be translated to pir program * and then run in executor for dy2st mode. */ PHI_DEFINE_EXPORTED_bool(enable_pir_with_pt_in_dy2st, @@ -1530,7 +1571,7 @@ PHI_DEFINE_EXPORTED_int64(alloc_fill_value, * Since Version: 3.0.0 * Value Range: bool, default=false * Example: - * Note: If Ture, will apply shape_optimization pass to new IR. + * Note: If True, will apply shape_optimization pass to new IR. */ PHI_DEFINE_EXPORTED_bool(pir_apply_shape_optimization_pass, false, diff --git a/paddle/common/union_find_set.h b/paddle/common/union_find_set.h new file mode 100644 index 0000000000000..b00c8ae7de8f5 --- /dev/null +++ b/paddle/common/union_find_set.h @@ -0,0 +1,72 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
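The new paddle/common/union_find_set.h below is a small disjoint-set (union-find): the non-const `Find` performs path compression, `Union` links the two roots, and `VisitCluster` groups every recorded element by its root. A usage sketch, assuming the header compiles as shown (the extraction has eaten a few template arguments, e.g. `parent_` should read `std::unordered_map<T, T>`):

```cpp
#include <iostream>
#include <vector>

#include "paddle/common/union_find_set.h"

int main() {
  common::UnionFindSet<int> ufs;
  ufs.Union(1, 2);
  ufs.Union(2, 3);  // {1, 2, 3} now share one root
  ufs.Union(7, 8);  // {7, 8} forms a second cluster

  std::cout << std::boolalpha;
  std::cout << ufs.HasSameRoot(1, 3) << "\n";  // true
  std::cout << ufs.HasSameRoot(3, 7) << "\n";  // false

  // VisitCluster hands each equivalence class to the callback once:
  // here, one vector of size 3 and one of size 2 (order unspecified).
  ufs.VisitCluster([](const std::vector<int>& cluster) {
    std::cout << "cluster of size " << cluster.size() << "\n";
  });
}
```

Note that only elements that have passed through `Union` appear in `VisitCluster`; `Find` on an unknown element returns the element itself without inserting it.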
+ +#pragma once + +#include +#include + +namespace common { + +template +class UnionFindSet { + public: + const T& Find(const T& x) const { + if (parent_.find(x) == parent_.end()) { + return x; + } + if (parent_.at(x) != x) return Find(parent_.at(x)); + return parent_.at(x); + } + + const T& Find(const T& x) { + if (parent_.find(x) == parent_.end()) { + return x; + } + if (parent_[x] != x) { + parent_[x] = Find(parent_[x]); + } + return parent_.at(x); + } + + void Union(const T& p, const T& q) { + if (parent_.find(p) == parent_.end()) { + parent_[p] = p; + } + if (parent_.find(q) == parent_.end()) { + parent_[q] = q; + } + parent_[Find(q)] = Find(p); + } + + template + void VisitCluster(const DoEachClusterT& DoEachCluster) const { + std::unordered_map> clusters_map; + for (auto it = parent_.begin(); it != parent_.end(); it++) { + clusters_map[Find(it->first)].emplace_back(it->first); + } + for (const auto& [_, clusters] : clusters_map) { + DoEachCluster(clusters); + } + } + + bool HasSameRoot(const T& p, const T& q) const { return Find(p) == Find(q); } + + std::unordered_map* GetMap() { return &parent_; } + + private: + std::unordered_map parent_; +}; + +} // namespace common diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec deleted file mode 100644 index d10ff999f6eb2..0000000000000 --- a/paddle/fluid/API.spec +++ /dev/null @@ -1,33 +0,0 @@ -paddle.incubate.optimizer.PipelineOptimizer (paddle.incubate.optimizer.PipelineOptimizer, ('document', '2e55a29dbeb874934f7a1a1af3a22b8c')) -paddle.incubate.optimizer.PipelineOptimizer.__init__ (ArgSpec(args=['self', 'optimizer', 'num_microbatches', 'start_cpu_core_id'], varargs=None, keywords=None, defaults=(1, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.incubate.optimizer.PipelineOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.audio.features (ArgSpec(), ('document', 'd41d8cd98f00b204e9800998ecf8427e')) -paddle.audio.features.layers.LogMelSpectrogram (ArgSpec(), ('document', 'c38b53606aa89215c4f00d3833e158b8')) -paddle.audio.features.layers.LogMelSpectrogram.forward (ArgSpec(args=['self', 'x'], varargs=None, varkw=None, defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'x': }), ('document', '6c14f6f78dc697a6981cf90412e2f1ea')) -paddle.audio.features.layers.LogMelSpectrogram.load_dict (ArgSpec(args=[], varargs='args', varkw='kwargs', defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={}), ('document', '01221a60445ee437f439a8cbe293f759')) -paddle.audio.features.layers.LogMelSpectrogram.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers', 'structured_name_prefix', 'use_hook'], varargs=None, varkw=None, defaults=(None, True, '', True), kwonlyargs=[], kwonlydefaults=None, annotations={}), ('document', '0c01cb0c12220c9426ae49549b145b0b')) -paddle.audio.features.layers.MFCC (ArgSpec(), ('document', 'bcbe6499830d9228a4f746ddd63b6c0f')) -paddle.audio.features.layers.MFCC.forward (ArgSpec(args=['self', 'x'], varargs=None, varkw=None, defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'x': }), ('document', 'd86bcaa345f26851089bfdb3efecd9e7')) -paddle.audio.features.layers.MelSpectrogram (ArgSpec(), ('document', 'adf4012310984568ae9da6170aa89f91')) -paddle.audio.features.layers.MelSpectrogram.forward (ArgSpec(args=['self', 'x'], varargs=None, varkw=None, defaults=None, kwonlyargs=[], 
kwonlydefaults=None, annotations={'return': , 'x': }), ('document', '458e9d454c8773091567c6b400f48cf5')) -paddle.audio.features.layers.Spectrogram (ArgSpec(), ('document', '83811af6da032099bf147e3e01a458e1')) -paddle.audio.features.layers.Spectrogram.forward (ArgSpec(args=['self', 'x'], varargs=None, varkw=None, defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'x': }), ('document', 'ab11e318fca1410f743b5432394dea35')) -paddle.audio.functional (ArgSpec(), ('document', 'd41d8cd98f00b204e9800998ecf8427e')) -paddle.audio.functional.functional.compute_fbank_matrix (ArgSpec(args=['sr', 'n_fft', 'n_mels', 'f_min', 'f_max', 'htk', 'norm', 'dtype'], varargs=None, varkw=None, defaults=(64, 0.0, None, False, 'slaney', 'float32'), kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'sr': , 'n_fft': , 'n_mels': , 'f_min': , 'f_max': typing.Union[float, NoneType], 'htk': , 'norm': typing.Union[str, float], 'dtype': }), ('document', '3c5411caa6baedb68860b09c81e0147c')) -paddle.audio.functional.functional.create_dct (ArgSpec(args=['n_mfcc', 'n_mels', 'norm', 'dtype'], varargs=None, varkw=None, defaults=('ortho', 'float32'), kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'n_mfcc': , 'n_mels': , 'norm': typing.Union[str, NoneType], 'dtype': }), ('document', 'c9c57550671f9725b053769411d2f65a')) -paddle.audio.functional.functional.fft_frequencies (ArgSpec(args=['sr', 'n_fft', 'dtype'], varargs=None, varkw=None, defaults=('float32',), kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'sr': , 'n_fft': , 'dtype': }), ('document', '057b990e79c9c780622407267c0a43c6')) -paddle.audio.functional.functional.hz_to_mel (ArgSpec(args=['freq', 'htk'], varargs=None, varkw=None, defaults=(False,), kwonlyargs=[], kwonlydefaults=None, annotations={'return': typing.Union[paddle.Tensor, float], 'freq': typing.Union[paddle.Tensor, float], 'htk': }), ('document', '7ca01521dd0bf26cd3f72c67f7168dc4')) -paddle.audio.functional.functional.mel_frequencies (ArgSpec(args=['n_mels', 'f_min', 'f_max', 'htk', 'dtype'], varargs=None, varkw=None, defaults=(64, 0.0, 11025.0, False, 'float32'), kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'n_mels': , 'f_min': , 'f_max': , 'htk': , 'dtype': }), ('document', '2af3cf997ed1274214ec240b2b59a98d')) -paddle.audio.functional.functional.mel_to_hz (ArgSpec(args=['mel', 'htk'], varargs=None, varkw=None, defaults=(False,), kwonlyargs=[], kwonlydefaults=None, annotations={'return': typing.Union[float, paddle.Tensor], 'mel': typing.Union[float, paddle.Tensor], 'htk': }), ('document', 'e93b432d382f98c60d7c7599489e7072')) -paddle.audio.functional.functional.power_to_db (ArgSpec(args=['spect', 'ref_value', 'amin', 'top_db'], varargs=None, varkw=None, defaults=(1.0, 1e-10, 80.0), kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'spect': , 'ref_value': , 'amin': , 'top_db': typing.Union[float, NoneType]}), ('document', '28bbb1973e8399e856bfaea0415cecb9')) -paddle.audio.functional.window.get_window (ArgSpec(args=['window', 'win_length', 'fftbins', 'dtype'], varargs=None, varkw=None, defaults=(True, 'float64'), kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'window': typing.Union[str, typing.Tuple[str, float]], 'win_length': , 'fftbins': , 'dtype': }), ('document', '2418d63da10c0cd5da9ecf0a88ddf783')) -paddle.audio.backends (ArgSpec(), ('document', 'd41d8cd98f00b204e9800998ecf8427e')) -paddle.audio.backends.init_backend.get_current_audio_backend (ArgSpec(args=[], varargs=None, varkw=None, defaults=None, 
kwonlyargs=[], kwonlydefaults=None, annotations={'return': }), ('document', '3ff9fd62e8be1f3dc7e34afaf50e1645')) -paddle.audio.backends.init_backend.list_available_backends (ArgSpec(args=[], varargs=None, varkw=None, defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={'return': typing.List[str]}), ('document', '8eba49f1b69f7ec7fa139a0714a2724e')) -paddle.audio.backends.init_backend.set_backend (ArgSpec(args=['backend_name'], varargs=None, varkw=None, defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={'backend_name': }), ('document', '9680247dd97274d345dee415e2787527')) -paddle.audio.backends.wave_backend.info (ArgSpec(args=['filepath', 'format'], varargs=None, varkw=None, defaults=(None,), kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'filepath': , 'format': typing.Union[str, NoneType]}), ('document', 'e0ffd3accd942a9b0a4c08463a9f60f6')) -paddle.audio.backends.wave_backend.load (ArgSpec(args=['filepath', 'frame_offset', 'num_frames', 'normalize', 'channels_first', 'format'], varargs=None, varkw=None, defaults=(0, -1, True, True, None), kwonlyargs=[], kwonlydefaults=None, annotations={'return': typing.Tuple[paddle.Tensor, int], 'filepath': typing.Union[str, pathlib.Path], 'frame_offset': , 'num_frames': , 'normalize': , 'channels_first': , 'format': typing.Union[str, NoneType]}), ('document', '4de50575ca516b4b7c7c82c7fdec808f')) -paddle.audio.backends.wave_backend.save (ArgSpec(args=['filepath', 'src', 'sample_rate', 'channels_first', 'compression', 'format', 'encoding', 'bits_per_sample'], varargs=None, varkw=None, defaults=(True, None, None, None, None), kwonlyargs=[], kwonlydefaults=None, annotations={'filepath': , 'src': , 'sample_rate': , 'channels_first': , 'compression': typing.Union[float, NoneType], 'format': typing.Union[str, NoneType], 'encoding': typing.Union[str, NoneType], 'bits_per_sample': typing.Union[int, NoneType]}), ('document', '4c85cfcd29a0dcdfc32e74db8c0c3961')) -paddle.audio.datasets (ArgSpec(), ('document', 'd41d8cd98f00b204e9800998ecf8427e')) -paddle.audio.datasets.TESS (ArgSpec(), ('document', '3605f3aa2191ede7ddbe594cd27bb067')) -paddle.audio.datasets.TESS.meta_info (ArgSpec(), ('document', '60d548a6f71629c3b69bcda3a30d4819')) diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc index 5e2be03108294..2d7326f825acc 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc @@ -39,7 +39,7 @@ void ComputeInterceptor::PrepareDeps() { for (int64_t i = 0; i < node_->max_run_times(); ++i) { ready_size_map.emplace(i, 0); } - in_readys_.emplace(up.first, std::make_pair(up.second, ready_size_map)); + in_readies_.emplace(up.first, std::make_pair(up.second, ready_size_map)); } for (auto down : downstream) { out_buffs_.emplace(down.first, std::make_pair(down.second, 0)); @@ -106,11 +106,11 @@ InterceptorMessage ComputeInterceptor::PrepareVarsMsg() { } void ComputeInterceptor::IncreaseReady(int64_t up_id, int64_t scope_id) { - auto it = in_readys_.find(up_id); + auto it = in_readies_.find(up_id); PADDLE_ENFORCE_NE(it, - in_readys_.end(), + in_readies_.end(), platform::errors::NotFound( - "Cannot find upstream=%lld in in_readys.", up_id)); + "Cannot find upstream=%lld in in_readies.", up_id)); auto max_ready_size = it->second.first; const auto& ready_scope_map = it->second.second; @@ -171,7 +171,7 @@ bool ComputeInterceptor::IsInputReady() { for (int64_t i = 
start_micro_step; i < start_micro_step + num_micro_step; ++i) { bool flag = true; - for (auto& ins : in_readys_) { + for (auto& ins : in_readies_) { auto ready_size_map = ins.second.second; flag = flag && (ready_size_map.at(i) != 0); } @@ -268,7 +268,7 @@ void ComputeInterceptor::SendDataReadyToDownStream() { } void ComputeInterceptor::ReplyCompletedToUpStream() { - for (auto& ins : in_readys_) { + for (auto& ins : in_readies_) { auto up_id = ins.first; auto ready_size = ins.second.second.at(cur_scope_id_); ready_size -= 1; diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.h b/paddle/fluid/distributed/fleet_executor/compute_interceptor.h index 26205d5ac8264..bb26c62061734 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.h +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.h @@ -41,7 +41,7 @@ class ComputeInterceptor : public Interceptor { // upstream_id-->(max_ready_size, scope-->ready_size) std::map>> - in_readys_{}; + in_readies_{}; // downstream_id-->(max_buffer_size, used_size) std::map> out_buffs_{}; diff --git a/paddle/fluid/distributed/index_dataset/CMakeLists.txt b/paddle/fluid/distributed/index_dataset/CMakeLists.txt index 0bd11cc214de4..7d6f963e48634 100644 --- a/paddle/fluid/distributed/index_dataset/CMakeLists.txt +++ b/paddle/fluid/distributed/index_dataset/CMakeLists.txt @@ -3,11 +3,11 @@ cc_library( index_wrapper SRCS index_wrapper.cc DEPS index_dataset_proto framework_io) -if(WITH_MKLDNN) +if(WITH_ONEDNN) cc_library( index_sampler SRCS index_sampler.cc - DEPS xxhash index_wrapper eigen3 mkldnn) + DEPS xxhash index_wrapper eigen3 onednn) else() cc_library( index_sampler diff --git a/paddle/fluid/distributed/ps/service/CMakeLists.txt b/paddle/fluid/distributed/ps/service/CMakeLists.txt index eac2585416d8b..42d9dbce2f4d8 100755 --- a/paddle/fluid/distributed/ps/service/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/service/CMakeLists.txt @@ -11,6 +11,10 @@ else() endif() +if(WITH_PSCORE AND NOT WITH_HETERPS) + set(BRPC_DEPS ${BRPC_DEPS} ps_service) +endif() + brpc_library( sendrecv_rpc SRCS diff --git a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt index a6bb716e6b7ad..64950443c0efc 100644 --- a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt @@ -1,6 +1,6 @@ add_subdirectory(generator) -set(EAGER_GENERETOR_DEPS +set(EAGER_GENERATOR_DEPS ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} pybind @@ -13,12 +13,12 @@ set(EAGER_GENERETOR_DEPS imperative_flag) if(WITH_CUSTOM_DEVICE) - set(EAGER_GENERETOR_DEPS ${EAGER_GENERETOR_DEPS} + set(EAGER_GENERATOR_DEPS ${EAGER_GENERATOR_DEPS} custom_device_common_op_registry) endif() add_executable(eager_generator eager_generator.cc) -target_link_libraries(eager_generator ${EAGER_GENERETOR_DEPS}) +target_link_libraries(eager_generator ${EAGER_GENERATOR_DEPS}) get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(eager_generator ${os_dependency_modules}) @@ -93,13 +93,13 @@ if(WIN32) list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/openblas.dll) endif() - if(WITH_MKLDNN) + if(WITH_ONEDNN) message("Copied mkldnn.dll for Eager AutoCodeGen") add_custom_command( OUTPUT ${eager_generator_path}/mkldnn.dll COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} ${eager_generator_path} - DEPENDS mkldnn) + DEPENDS onednn) list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/mkldnn.dll) endif() diff --git 
a/paddle/fluid/eager/auto_code_generator/generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/generator/codegen_utils.py index f6892628f3b78..47bed1595a465 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/generator/codegen_utils.py @@ -57,6 +57,7 @@ "conv3d_double_grad", "depthwise_conv2d_grad_grad", "concat_double_grad", + "stack_double_grad", "expand_grad", "argsort_grad", "eigh_grad", diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index 128f159e1d0e1..c272e09a9579f 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -48,7 +48,7 @@ # so we should check parameter(output) with rule of inplace. # But because there is no check in old dygraph mode, in order to # keeping the code compatible, here we also skip inplace check in new dygraph temporarily, -# and this will be fixed in the futrue. +# and this will be fixed in the future. inplace_check_blacklist = {"assign_out_"} # Black Ops list that's NO NEED to apply code generation @@ -75,9 +75,12 @@ "tanh_triple_grad", "minimum_double_grad", "maximum_double_grad", + "abs_triple_grad", + "exp_double_grad", + "log_double_grad", ] -# white ops list whose kernel can automaically do type promotion. +# white ops list whose kernel can automatically do type promotion. # future will get this list from same place with static graph. type_promote_white_list = { "add": ["x", "y"], @@ -85,8 +88,8 @@ "where": ["x", "y"], } -# dict of special api that forward api's output will affect bacward api's output -# bacward api's output usually affected by backward api's input +# dict of special api that forward api's output will affect backward api's output +# backward api's output usually affected by backward api's input special_prune_dict = { "matmul_grad": {"x": "grad_y", "y": "grad_x"}, } @@ -289,7 +292,7 @@ class {} : public egr::GradNodeBase {{ // Forward API Call {} - // Log memory infomation + // Log memory information {} // Check NaN and Inf if needed {} @@ -343,7 +346,7 @@ class {} : public egr::GradNodeBase {{ {} // Forward API Call {} - // Log memory infomation + // Log memory information {} // Check NaN and Inf if needed {} @@ -535,8 +538,8 @@ class {} : public egr::GradNodeBase {{ """ TYPE_PROMOTION_LOGIC_TEMPLATE = """ if (phi::NeedTypePromotion({x}.dtype(), {y}.dtype())) {{ - VLOG(5) << "got different data type, run type protmotion automatically."; - LOG_FIRST_N(WARNING, 1) << "got different data type, run type protmotion automatically, this may cause data type been changed."; + VLOG(5) << "got different data type, run type promotion automatically."; + LOG_FIRST_N(WARNING, 1) << "got different data type, run type promotion automatically, this may cause data type been changed."; {op_name} auto promotion_type = phi::GetPromoteDtype(op_name, {x}.dtype(), {y}.dtype()); @@ -1128,7 +1131,7 @@ def GenerateNodeCreationCodes(self, for_backward=False, is_inplaced=False): need_pre_contiguous_set.add(name) set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper_{name}({name}_tmp);" set_input_tensor_wrappers_list.append(set_tensor_wrappers) - else: # Forwad's output as backward's input + else: # Forward's output as backward's input if num_fwd_outputs > 1: # Aligned with forward output position assert name in forward_outputs_position_map, AssertMessage( @@ -1830,9 +1833,7 @@ def 
GenerateForwardDefinitionAndDeclaration(self, is_inplaced): f"return {forward_ad_function_name}({amp_inputs_call_args_str});" ) if is_inplaced or (forward_api_name == "cast"): - amp_logic_str = "\n VLOG(5) << \" No AMP for {} because it is a inplace or cast api. \"; ".format( - forward_ad_function_name - ) + amp_logic_str = f"\n VLOG(5) << \" No AMP for {forward_ad_function_name} because it is a inplace or cast api. \"; " else: amp_logic_str = AMP_LOGIC_TEMPLATE.format( kernel_trans2_op_name_str, @@ -1859,11 +1860,7 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): return_value=type_promote_call_list, ) else: - type_promotion_logic_str = ( - "\n VLOG(5) << \" No Type Promotion for {} api. \"; ".format( - forward_ad_function_name - ) - ) + type_promotion_logic_str = f"\n VLOG(5) << \" No Type Promotion for {forward_ad_function_name} api. \"; " # Forward layout autotune layout_autotune_list_str = " ".join( layout_autotune_list @@ -1897,9 +1894,7 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): # Generate forward_definition_str and forward_declaration_str if self.is_forward_only: if len(amp_tensors_vector_list) == 0: - amp_logic_str = "\n VLOG(7) << \" No AMP for {} because it has no input. \"; ".format( - forward_ad_function_name - ) + amp_logic_str = f"\n VLOG(7) << \" No AMP for {forward_ad_function_name} because it has no input. \"; " self.forward_definition_str += ( FORWARD_ONLY_FUNCTION_TEMPLATE.format( returns_type_str, @@ -3063,7 +3058,7 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str): for i in range(len(api_yaml_paths)): api_yaml_path = api_yaml_paths[i] - # string api is forwrad only + # string api is forward only if not api_yaml_path.endswith('strings_ops.yaml'): backward_yaml_path = backward_yaml_paths[i] else: diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 2a97f5bf35e90..ce7f7caf1f44c 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -261,6 +261,106 @@ void GradNodeBase::SetGradInMeta(const std::vector& fwd_out, } } +void GradNodeBase::SetGradInMeta(const std::vector& fwd_out, + size_t slot_rank) { + VLOG(7) << "Set GradSlotMeta for Grad Inputs"; + size_t slot_size = fwd_out.size(); + PADDLE_ENFORCE_LE( + slot_rank, + (bwd_in_meta_.size() - 1), + paddle::platform::errors::InvalidArgument( + "Slot Rank should less equal than bwd_in_meta_ size, since " + "bwd_in_meta_ is designed to hold as same num as backward " + "inputs.")); + auto& metas = bwd_in_meta_.at(slot_rank); + // Init stop gradient vector before use to avoid push back + if (metas.size() < slot_size) { + VLOG(7) << "Init bwd_in_meta_ with slot rank: " << slot_rank; + metas.resize(slot_size); + } + for (size_t i = 0; i < slot_size; i++) { + auto& meta = metas[i]; + const auto& fwd_out_tensor = *fwd_out[i]; + auto* fwd_out_meta = + egr::EagerUtils::nullable_autograd_meta(fwd_out_tensor); + PADDLE_ENFORCE_NOT_NULL(fwd_out_meta, + paddle::platform::errors::PreconditionNotMet( + "Bwd_in_meta should only be called while " + "autograd_meta is not null. If you got this " + "error, it indicates bugs in framework.")); + if (fwd_out_meta && fwd_out_meta->StopGradient()) { + // Set Stop Gradient only when its true or non-initialized autograd_meta, + // since all default value is false. 
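The `SetGradInMeta` overload above leans on the `PADDLE_ENFORCE_*` family (`ENFORCE_LE`, `ENFORCE_NOT_NULL`, `ENFORCE_NE`) that this PR is also substituting for bare `CHECK_*` calls elsewhere: the macro evaluates a condition and, on failure, raises a typed error carrying a formatted message instead of aborting the process. A minimal sketch of the mechanism (not Paddle's real implementation, which additionally records file, line, and a C++ traceback):

```cpp
#include <cstdio>
#include <sstream>
#include <stdexcept>

// Illustrative stand-in for PADDLE_ENFORCE_EQ: evaluate both sides once,
// and on mismatch throw an exception carrying the values and a message.
#define SKETCH_ENFORCE_EQ(lhs, rhs, msg)                                  \
  do {                                                                    \
    auto&& lhs_v = (lhs);                                                 \
    auto&& rhs_v = (rhs);                                                 \
    if (!(lhs_v == rhs_v)) {                                              \
      std::ostringstream oss;                                             \
      oss << "Enforce failed: " #lhs " == " #rhs " (" << lhs_v << " vs "  \
          << rhs_v << "). " << (msg);                                     \
      throw std::invalid_argument(oss.str());                             \
    }                                                                     \
  } while (0)

int main() {
  SKETCH_ENFORCE_EQ(2 + 2, 4, "sanity check");  // passes silently
  try {
    SKETCH_ENFORCE_EQ(3, 4, "sizes must match");
  } catch (const std::invalid_argument& e) {
    std::puts(e.what());  // Enforce failed: 3 == 4 (3 vs 4). sizes must match
  }
}
```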
+ meta.SetStopGradient(fwd_out_meta->StopGradient()); + } + + if (!fwd_out_tensor.initialized()) { + if (fwd_out_tensor.defined() && fwd_out_tensor.is_dist_tensor() && + phi::distributed::NeedComputationClipForPP(fwd_out_tensor.impl())) { + VLOG(3) << "Tensor " << fwd_out_tensor.name() << " is DistTensor," + << " and needs computation clip for pipeline parallel." + << " Still SetGradInMeta for it."; + } else { + VLOG(7) << "Skip Configuring GradSlotMeta for uninitialized GradInput " + "Tensor"; + return; + } + } + + // Record TensorMeta + if (phi::DenseTensor::classof(fwd_out_tensor.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(fwd_out_tensor.impl().get()); + + PADDLE_ENFORCE_NE( + dense_tensor->meta().dtype, + phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal("Attempting to copy DenseTensorMeta " + "with phi::DataType::UNDEFINED," + "which is illegal.")); + meta.SetTensorMeta(dense_tensor->meta()); + meta.SetPlace(fwd_out_tensor.place()); + + if (dense_tensor->type() == phi::DataType::COMPLEX64 || + dense_tensor->type() == phi::DataType::COMPLEX128) { + need_complex_to_real_ = true; + } + } else if (phi::distributed::DistTensor::classof( + fwd_out_tensor.impl().get())) { + // Only Copy Meta + meta.SetDistAttr(static_cast( + fwd_out_tensor.impl().get()) + ->dist_attr()); + meta.SetDistTensorGlobalDims(static_cast( + fwd_out_tensor.impl().get()) + ->dims()); + SetIsRunAutoParallel(true); + + auto dense_tensor = static_cast( + fwd_out_tensor.impl().get()) + ->value(); + + PADDLE_ENFORCE_NE( + dense_tensor.meta().dtype, + phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal("Attempting to copy DenseTensorMeta " + "with phi::DataType::UNDEFINED," + "which is illegal.")); + meta.SetTensorMeta(dense_tensor.meta()); + meta.SetPlace(fwd_out_tensor.place()); + + if (dense_tensor.type() == phi::DataType::COMPLEX64 || + dense_tensor.type() == phi::DataType::COMPLEX128) { + need_complex_to_real_ = true; + } + } else { + VLOG(7) << "Unable to initialize the DenseTensorMeta of GradSlotMeta " + "with non-DenseTensor argument."; + } + } +} + void GradNodeBase::SetGradOutMeta(const paddle::Tensor& fwd_in, size_t slot_rank) { auto* fwd_in_meta = egr::EagerUtils::nullable_autograd_meta(fwd_in); diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 7b5e36f4d5cdc..73eedaba9e4f3 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -250,7 +250,8 @@ class GradNodeBase { void SetGradInMeta(const std::vector& fwd_out, size_t slot_rank); void SetGradInMeta(const paddle::Tensor& fwd_out, size_t slot_rank); - + void SetGradInMeta(const std::vector& fwd_out, + size_t slot_rank); void SetGradOutMeta(const std::vector& fwd_in, size_t slot_rank); void SetGradOutMeta(const std::vector& fwd_in, diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 71a72db60d8cb..18e72c4f0782a 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -142,33 +142,6 @@ class TensorWrapper { } } -#ifndef PADDLE_NO_PYTHON - TensorWrapper(const TensorWrapper& other) { - no_need_buffer_ = other.no_need_buffer_; - intermidiate_tensor_ = other.intermidiate_tensor_; - weak_grad_node_ = other.weak_grad_node_; - inplace_version_snapshot_ = other.inplace_version_snapshot_; - packed_value_ = other.packed_value_; - unpack_hook_ = other.unpack_hook_; - if (packed_value_) { - packed_value_->inc_ref(); - } - } - - TensorWrapper& operator=(const 
TensorWrapper& other) { - no_need_buffer_ = other.no_need_buffer_; - intermidiate_tensor_ = other.intermidiate_tensor_; - weak_grad_node_ = other.weak_grad_node_; - inplace_version_snapshot_ = other.inplace_version_snapshot_; - packed_value_ = other.packed_value_; - unpack_hook_ = other.unpack_hook_; - if (packed_value_) { - packed_value_->inc_ref(); - } - return *this; - } -#endif - paddle::Tensor recover() { VLOG(6) << "Recover tensor: " << intermidiate_tensor_.name() << " for wrapper"; diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index 39ec0e7fe31a3..eaee4e9984f8d 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -22,7 +22,6 @@ #include "paddle/fluid/framework/tensor_ref_array.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/ir_adaptor/translator/program_translator.h" -#include "paddle/fluid/operators/run_program_op.h" #include "paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler/event_tracing.h" @@ -33,9 +32,14 @@ #include "paddle/pir/include/core/program.h" #include "paddle/pir/include/core/value.h" +#ifdef PADDLE_WITH_DNNL +#include "paddle/fluid/platform/onednn_helper.h" +#endif + COMMON_DECLARE_bool(enable_pir_with_pt_in_dy2st); COMMON_DECLARE_bool(enable_pir_in_executor); COMMON_DECLARE_bool(print_ir); +COMMON_DECLARE_bool(use_mkldnn); namespace details { using Tensor = paddle::Tensor; @@ -91,39 +95,45 @@ static bool IsVariableRefArray(const Tensor &tensor) { static auto GetNameFromValue(const ::pir::Block *block, const std::vector<::pir::Value> &values, - bool is_input) { + bool allow_input, + bool allow_output) { + PADDLE_ENFORCE_EQ( + allow_input || allow_output, + true, + paddle::platform::errors::InvalidArgument( + "GetNameFromValue should allow input or output at least one.")); // we use name here, later value is used directly. 
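`GetNameFromValue` here trades its single `bool is_input` parameter for independent `allow_input`/`allow_output` flags, guarded up front so that at least one must be set; a single walk over the block can then harvest input names, output names, or both, which the inplace-chain case further down relies on. The guard pattern in isolation, with a hypothetical reduced signature:

```cpp
#include <cassert>
#include <string>
#include <vector>

// Reduced model: each flag enables one family of ops to be scanned;
// requesting neither is a caller bug, caught eagerly.
std::vector<std::string> CollectNames(bool allow_input, bool allow_output) {
  assert((allow_input || allow_output) &&
         "should allow input or output at least one");
  std::vector<std::string> names;
  if (allow_input) names.emplace_back("pd_op.data");              // input sources
  if (allow_output) names.emplace_back("builtin.shadow_output");  // output sinks
  return names;
}
```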
std::unordered_map<::pir::Value, std::string> value2name; - if (is_input) { + if (allow_input) { for (auto &kwarg : block->kwargs()) { value2name[kwarg.second] = kwarg.first; } } for (auto &op : *block) { std::string name; - if (is_input && op.name() == "pd_op.data") { + if (allow_input && op.name() == "pd_op.data") { name = op.attributes().at("name").dyn_cast().AsString(); value2name[op.results()[0].Value::impl()] = name; - } else if (!is_input && op.name() == "builtin.set_parameter") { + } else if (allow_output && op.name() == "builtin.set_parameter") { name = op.attributes() .at("parameter_name") .dyn_cast() .AsString(); value2name[op.operand(0).source()] = name; - } else if (!is_input && op.name() == "builtin.shadow_output") { + } else if (allow_output && op.name() == "builtin.shadow_output") { name = op.attributes() .at("output_name") .dyn_cast() .AsString(); value2name[op.operand(0).source()] = name; - } else if (is_input && op.name() == "builtin.parameter") { + } else if (allow_input && op.name() == "builtin.parameter") { name = op.attributes() .at("parameter_name") .dyn_cast() .AsString(); value2name[op.result(0).Value::impl()] = name; - } else if (is_input && op.name() == "builtin.constant") { + } else if (allow_input && op.name() == "builtin.constant") { if (op.isa()) { name = op.dyn_cast().tensor_name(); value2name[op.result(0).Value::impl()] = name; @@ -248,12 +258,7 @@ static void ShareTensorsIntoScopeByValue( const std::vector &tensors, const std::vector<::pir::Value> &values, paddle::framework::Scope *scope) { - auto names = GetNameFromValue(block, values, true); - if (VLOG_IS_ON(4)) { - for (auto &s : names) { - VLOG(4) << "ShareTensorIntoScopeByValue name: " << s; - } - } + auto names = GetNameFromValue(block, values, true, false); ShareTensorsIntoScopeWithName(tensors, names, scope); } @@ -262,11 +267,16 @@ static void ShareTensorsFromScopeByValue( const std::vector &tensors, const std::vector<::pir::Value> &values, paddle::framework::Scope *scope) { - auto names = GetNameFromValue(block, values, false); + // NOTE(SigureMo): If the program has an inplace chain connecting + // an input value to an output value, the output value will be + // replaced with the input value, so we set the `allow_input` to + // `true` in `GetNameFromValue` + auto names = GetNameFromValue(block, values, true, true); for (size_t i = 0; i < tensors.size(); ++i) { auto &name = names[i]; auto &value = values[i]; - VLOG(2) << "share " << name << " from scope"; + VLOG(4) << "Share Tensor From Scope: " << name; + if (value.impl() == nullptr) { // skip stop_gradient. 
continue; @@ -524,20 +534,20 @@ inline void PirRunProgramAPI( // *backward_program); // update interpretercore skip_gc_var - auto skip_names = - details::GetNameFromValue(forward_global_block, middle_values, false); + auto skip_names = details::GetNameFromValue( + forward_global_block, middle_values, false, true); auto skip_names_set = std::set(skip_names.begin(), skip_names.end()); auto no_need_buffer_values = PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("no_need_buffers")); auto no_need_buffer_names = details::GetNameFromValue( - forward_global_block, no_need_buffer_values, false); + forward_global_block, no_need_buffer_values, false, true); for (auto &name : no_need_buffer_names) { VLOG(4) << "Find no need buffer vars with name:" << name; skip_names_set.erase(name); } - skip_names = - details::GetNameFromValue(forward_global_block, output_values, false); + skip_names = details::GetNameFromValue( + forward_global_block, output_values, false, true); skip_names_set.insert(skip_names.begin(), skip_names.end()); details::print_collection(skip_names_set); interpreter_core->SetSkipGcVars(skip_names_set); @@ -1092,6 +1102,11 @@ inline void PirRunProgramGradAPI( // Step 1. share input_vars & parameters into scope auto passed_kernel_program = paddle::framework::ApplyIrPass(backward_program, place); + + const auto &new_block = passed_kernel_program->block(); + passed_kernel_program = paddle::framework::ApplyRemoveShadowFeedPass( + std::move(passed_kernel_program), new_block, place, global_inner_scope); + if (FLAGS_print_ir) { std::ostringstream print_stream; print_stream << "LoweredProgram( AfterPass | Backward ) is :\n"; @@ -1127,11 +1142,11 @@ inline void PirRunProgramGradAPI( // get all eager gc vars std::set skip_eager_delete_vars; - auto skip_names = - details::GetNameFromValue(backward_global_block, x_grad_values, false); + auto skip_names = details::GetNameFromValue( + backward_global_block, x_grad_values, false, true); skip_eager_delete_vars.insert(skip_names.begin(), skip_names.end()); - skip_names = - details::GetNameFromValue(backward_global_block, p_grad_values, false); + skip_names = details::GetNameFromValue( + backward_global_block, p_grad_values, false, true); skip_eager_delete_vars.insert(skip_names.begin(), skip_names.end()); interpreter_core->SetSkipGcVars(skip_eager_delete_vars); cache.UpdateSkipEagerDeleteVars(program_id, diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 4dc0db770727a..1659430d6216f 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -717,8 +717,8 @@ std::string EagerUtils::GradNodeStr(const egr::GradNodeBase& node) { in_slot_str += paddle::string::Sprintf(SLOT_INFO_TEMPLATE, i, sg_str, edges_str); } - std::string in_meta_str = - paddle::string::Sprintf(GRAD_SLOT_META_TEMPLATE, in_slot_str); + std::string in_meta_str = paddle::string::Sprintf( + GRAD_SLOT_META_TEMPLATE, in_metas.size(), in_slot_str); return paddle::string::Sprintf( GRAD_NODE_TEMPLATE, out_meta_str, in_meta_str); } else if (VLOG_IS_ON(5)) { diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 4dfd8312f6153..62459827d3c39 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -153,10 +153,10 @@ if(WITH_XPU) target_link_libraries(var_type_traits dynload_xpti) endif() -# every source file that includes "dnnl.h" must depends on mkldnn -# or, the first one should depends on mkldnn -if(WITH_MKLDNN) - add_dependencies(var_type_traits mkldnn) +# every 
source file that includes "dnnl.h" must depend on onednn +# or, the first one should depend on onednn +if(WITH_ONEDNN) + add_dependencies(var_type_traits onednn) endif() set(BRPC_DEPS "") @@ -273,10 +273,10 @@ cc_library( SRCS shape_inference.cc DEPS phi common attribute selected_rows_utils) -# every source file that includes "dnnl.h" must depends on mkldnn -# or, the first one should depends on mkldnn -if(WITH_MKLDNN) - add_dependencies(shape_inference mkldnn) +# every source file that includes "dnnl.h" must depend on onednn +# or, the first one should depend on onednn +if(WITH_ONEDNN) + add_dependencies(shape_inference onednn) endif() cc_library( @@ -954,8 +954,8 @@ cc_library( DEPS common) target_link_libraries(type_info pir op_dialect) add_dependencies(type_info framework_proto auto_parallel_proto xxhash) -if(WITH_MKLDNN) - add_dependencies(type_info mkldnn) +if(WITH_ONEDNN) + add_dependencies(type_info onednn) endif() set(FLUID_FRAMEWORK_MODULES diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index 039ed3ffc2441..1a70dca1ff4f1 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -26,7 +26,7 @@ class Variable; } // namespace paddle #ifdef PADDLE_WITH_DNNL -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/onednn_helper.h" #endif namespace paddle { diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index d771a12411adb..20c1444f238eb 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -54,8 +54,8 @@ set(op_handle_deps selected_rows_utils reference_count_pass_helper) -if(WITH_MKLDNN) - set(op_handle_deps ${op_handle_deps} mkldnn) +if(WITH_ONEDNN) + set(op_handle_deps ${op_handle_deps} onednn) endif() if(WITH_DGC) @@ -161,6 +161,6 @@ cc_library( SRCS build_strategy.cc DEPS pass_builder ${IR_PASS_DEPS}) -if(WITH_MKLDNN) - target_link_libraries(build_strategy mkldnn_placement_pass) +if(WITH_ONEDNN) + target_link_libraries(build_strategy onednn_placement_pass) endif() diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index f49936bf44739..79578e5653a22 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -61,8 +61,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Note: This is a trick to support 0D-Tensor for CINN. This pass will be // removed in the near future. AppendPass("cinn_zero_tensor_trick_pass"); - // Note: This pass is used to enable cinn. - AppendPass("build_cinn_pass"); AppendPrintGraphPass("graph_viz_pass", "_build_cinn_graph"); } #endif @@ -78,7 +76,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { AppendMultiDevPass(); AppendMultiGraphOptPasses(); - AppendPassToSetMkldnnAttr("mkldnn_placement_pass"); + AppendPassToSetMkldnnAttr("onednn_placement_pass"); // runtime_context_cache pass should be the last pass to enable the attr of // all original and fused operators. But no operator can have this attr // enabled if the pass is placed after MultiDevPass. @@ -179,7 +177,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { "delete_dropout_op_x_pass"); AppendPassWithCheck( strategy_.enable_inference_pass_ && strategy_.use_mkldnn_, - "mkldnn_placement_pass"); + "onednn_placement_pass"); // 2.
training pass #ifdef PADDLE_WITH_CUDNN_FRONTEND @@ -480,7 +478,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, "GPU, skipped."; continue; } - } else if (pass->Type() == "mkldnn_placement_pass") { + } else if (pass->Type() == "onednn_placement_pass") { pass->Set("mkldnn_enabled_op_types", new std::unordered_set(mkldnn_enabled_op_types_)); } else if (pass->Type() == "backward_optimizer_op_deps_pass") { @@ -548,7 +546,7 @@ USE_PASS(build_cinn_pass); USE_PASS(fused_feedforward_pass); #endif #ifdef PADDLE_WITH_DNNL -USE_PASS(mkldnn_placement_pass); +USE_PASS(onednn_placement_pass); #endif #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ !defined(_WIN32) && !defined(__APPLE__) diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index e954fd6a7a348..c0c7e6765b4dc 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -141,8 +141,8 @@ struct BuildStrategy { // Fuse ResUnit bool fuse_resunit_{false}; // mkldnn_enabled_op_types specify the operator type list to - // use MKLDNN acceleration. It is null in default, means - // that all the operators supported by MKLDNN will be + // use OneDNN acceleration. It is empty by default, which means + // that all the operators supported by OneDNN will be // accelerated. And it should not be set when // FLAGS_use_mkldnn=false std::unordered_set<std::string> mkldnn_enabled_op_types_; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index fbc2565e755fa..9d6ac59018856 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -26,7 +26,7 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #ifdef PADDLE_WITH_DNNL -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/onednn_helper.h" #endif #include "paddle/common/flags.h" #include "paddle/fluid/framework/executor_gc_helper.h" @@ -609,7 +609,7 @@ void Executor::EnableMKLDNN(const ProgramDesc& program) { } #else LOG(WARNING) - << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option"; + << "'MKLDNN' is not supported. Please re-compile with the WITH_ONEDNN option"; #endif } } // namespace framework diff --git a/paddle/fluid/framework/executor_cache.cc b/paddle/fluid/framework/executor_cache.cc index 0be2a603502cb..9045ca0f6a17d 100644 --- a/paddle/fluid/framework/executor_cache.cc +++ b/paddle/fluid/framework/executor_cache.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/ir_adaptor/translator/translate.h" #include "paddle/fluid/pir/transforms/general/inplace_pass.h" +#include "paddle/fluid/pir/transforms/general/remove_shadow_feed_pass.h" #include "paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h" #include "paddle/pir/include/core/program.h" #include "paddle/pir/include/core/value.h" @@ -384,6 +385,28 @@ std::unique_ptr<::pir::Program> ApplyIrPass(::pir::Program *program, return ir_res; } +std::unique_ptr<::pir::Program> ApplyRemoveShadowFeedPass( + std::unique_ptr<::pir::Program> program, + const pir::Block *block, + const phi::Place &place, + const paddle::framework::Scope *scope) { + ::pir::PassManager pm(::pir::IrContext::Instance(), 3); + auto pass = ::pir::CreateRemoveShadowFeedPass(); + pass->SetNotOwned("top_block", block); + pass->SetNotOwned(pir::Pass::kPlaceAttr, &place); + pass->SetNotOwned(pir::Pass::kParamScopeAttr, scope); +
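// Clarifying note, not part of the original patch: SetNotOwned stores raw, non-owning pointers, so `block`, `place`, and `scope` must stay alive until pm.Run() below has finished; only the pass object itself is owned by the PassManager, via the std::move that follows.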
pm.AddPass(std::move(pass)); + pm.Run(program.get()); + + if (FLAGS_print_ir) { + std::cout << "IR After RemoveShadowFeedPass -------------------" + << std::endl; + std::cout << *program << std::endl; + } + + return program; +} + std::unique_ptr<::pir::Program> ConstructForwardIrProgram( const paddle::framework::BlockDesc *forward_global_block, const paddle::framework::BlockDesc *backward_global_block, diff --git a/paddle/fluid/framework/executor_cache.h b/paddle/fluid/framework/executor_cache.h index f9afaabec79dc..1e5136892d13f 100644 --- a/paddle/fluid/framework/executor_cache.h +++ b/paddle/fluid/framework/executor_cache.h @@ -273,6 +273,12 @@ std::shared_ptr<InterpreterCore> CreatePirInterpreterCoreInfoToCache( std::unique_ptr<::pir::Program> ApplyIrPass(::pir::Program* program, phi::Place place); +std::unique_ptr<::pir::Program> ApplyRemoveShadowFeedPass( + std::unique_ptr<::pir::Program> program, + const pir::Block* block, + const phi::Place& place, + const paddle::framework::Scope* scope); + std::unique_ptr<::pir::Program> ConstructForwardIrProgram( const paddle::framework::BlockDesc* forward_global_block, const paddle::framework::BlockDesc* backward_global_block, diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu index 7ad502c89af92..58ab45db3e940 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu @@ -355,22 +355,26 @@ void AccessorWrapper::CopyForPushImpl( int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr()); int* d_slot_vector = reinterpret_cast<int*>(buf_slot_vector->ptr()); int* d_mf_dim_vector = reinterpret_cast<int*>(buf_mf_dim_vector->ptr()); - cudaMemcpy(gpu_values, - grad_values.data(), - grad_values.size() * sizeof(float*), - cudaMemcpyHostToDevice); - cudaMemcpy(gpu_len, - slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), - cudaMemcpyHostToDevice); - cudaMemcpy(d_slot_vector, - slot_vector.data(), - slot_lengths_lod.size() * sizeof(int), - cudaMemcpyHostToDevice); - cudaMemcpy(d_mf_dim_vector, - slot_mf_dim_vector.data(), - slot_lengths_lod.size() * sizeof(int), - cudaMemcpyHostToDevice); + // The async copies below are enqueued on the same `stream` as the + // PushCopyWithPool kernel, so they are ordered before it; the host-side + // vectors must remain valid until the stream is synchronized. + cudaMemcpyAsync(gpu_values, + grad_values.data(), + grad_values.size() * sizeof(float*), + cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(gpu_len, + slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), + cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(d_slot_vector, + slot_vector.data(), + slot_lengths_lod.size() * sizeof(int), + cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(d_mf_dim_vector, + slot_mf_dim_vector.data(), + slot_lengths_lod.size() * sizeof(int), + cudaMemcpyHostToDevice, + stream); PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( total_grad_values_gpu, gpu_values, diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 069dfeeec157b..49a1592348895 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -1631,6 +1631,7 @@ void HeterComm::pull_merge_sparse( val_type_size); } + AnyDeviceGuard guard2(dev_id); auto d_merged_vals = MemoryAlloc(place, uniq_len * val_type_size); auto d_merged_vals_ptr = reinterpret_cast<float*>(d_merged_vals->ptr()); heter_comm_kernel_->dy_mf_fill_dvals(d_shard_vals_ptr, diff --git a/paddle/fluid/framework/io/CMakeLists.txt b/paddle/fluid/framework/io/CMakeLists.txt index 82f879bce353b..8d55a10ee3310
100644 --- a/paddle/fluid/framework/io/CMakeLists.txt +++ b/paddle/fluid/framework/io/CMakeLists.txt @@ -14,6 +14,6 @@ cc_library( SRCS ${framework_io_srcs} DEPS ${framework_io_deps}) -if(WITH_MKLDNN) - add_dependencies(framework_io mkldnn) +if(WITH_ONEDNN) + add_dependencies(framework_io onednn) endif() diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index cb8093298d9bb..95c5d1ec796cc 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -174,42 +174,42 @@ if(WITH_GPU OR WITH_ROCM) pass_library(embedding_eltwise_layernorm_fuse_pass inference) endif() -if(WITH_MKLDNN) - pass_library(mkldnn_placement_pass base DEPS placement_pass_base DIR mkldnn) - pass_library(depthwise_conv_mkldnn_pass base DIR mkldnn) - pass_library(conv_affine_channel_mkldnn_fuse_pass inference DIR mkldnn) - pass_library(conv_bias_mkldnn_fuse_pass inference DIR mkldnn) - pass_library(conv_activation_mkldnn_fuse_pass inference DIR mkldnn) - pass_library(conv_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) - pass_library(int8_scale_calculation_mkldnn_pass inference DIR mkldnn) - pass_library(params_quantization_mkldnn_pass inference DIR mkldnn) - pass_library(scale_matmul_fuse_pass inference DIR mkldnn) - pass_library(cpu_bfloat16_placement_pass inference DIR mkldnn) - pass_library(cpu_bfloat16_pass inference DIR mkldnn) - pass_library(fc_mkldnn_pass inference DIR mkldnn) - pass_library(interpolate_mkldnn_pass inference DIR mkldnn) - pass_library(softplus_activation_onednn_fuse_pass inference DIR mkldnn) - pass_library(shuffle_channel_mkldnn_detect_pass inference DIR mkldnn) - pass_library(fc_act_mkldnn_fuse_pass inference DIR mkldnn) - pass_library(elementwise_act_onednn_fuse_pass inference DIR mkldnn) - pass_library(matmul_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) - pass_library(matmul_activation_mkldnn_fuse_pass inference DIR mkldnn) - pass_library(operator_scale_onednn_fuse_pass inference DIR mkldnn) - pass_library(quant_transpose2_dequant_onednn_fuse_pass inference DIR mkldnn) - pass_library(squeeze2_transpose2_onednn_fuse_pass inference DIR mkldnn) - pass_library(operator_unsqueeze2_onednn_fuse_pass inference DIR mkldnn) - pass_library(operator_reshape2_onednn_fuse_pass inference DIR mkldnn) - pass_library(cpu_quantize_placement_pass base DIR mkldnn) - pass_library(cpu_quantize_pass inference DIR mkldnn) - pass_library(cpu_quantize_squash_pass inference DIR mkldnn) - pass_library(reshape_transpose_matmul_mkldnn_fuse_pass inference DIR mkldnn) - pass_library(matmul_transpose_reshape_mkldnn_fuse_pass inference DIR mkldnn) - pass_library(batch_norm_act_fuse_pass inference DIR mkldnn) - pass_library(multi_gru_fuse_pass inference DIR mkldnn) - pass_library(multi_gru_seq_fuse_pass inference DIR mkldnn) - pass_library(quant_dequant_mkldnn_pass inference DIR mkldnn) - pass_library(compute_propagate_scales_mkldnn_pass inference DIR mkldnn) - pass_library(self_attention_fuse_pass inference DIR mkldnn) +if(WITH_ONEDNN) + pass_library(onednn_placement_pass base DEPS placement_pass_base DIR onednn) + pass_library(depthwise_conv_onednn_pass base DIR onednn) + pass_library(conv_affine_channel_onednn_fuse_pass inference DIR onednn) + pass_library(conv_bias_onednn_fuse_pass inference DIR onednn) + pass_library(conv_activation_onednn_fuse_pass inference DIR onednn) + pass_library(conv_elementwise_add_onednn_fuse_pass inference DIR onednn) + pass_library(int8_scale_calculation_onednn_pass inference DIR onednn) + 
pass_library(params_quantization_onednn_pass inference DIR onednn) + pass_library(scale_matmul_fuse_pass inference DIR onednn) + pass_library(cpu_bfloat16_placement_pass inference DIR onednn) + pass_library(cpu_bfloat16_pass inference DIR onednn) + pass_library(fc_onednn_pass inference DIR onednn) + pass_library(interpolate_onednn_pass inference DIR onednn) + pass_library(softplus_activation_onednn_fuse_pass inference DIR onednn) + pass_library(shuffle_channel_onednn_detect_pass inference DIR onednn) + pass_library(fc_act_onednn_fuse_pass inference DIR onednn) + pass_library(elementwise_act_onednn_fuse_pass inference DIR onednn) + pass_library(matmul_elementwise_add_onednn_fuse_pass inference DIR onednn) + pass_library(matmul_activation_onednn_fuse_pass inference DIR onednn) + pass_library(operator_scale_onednn_fuse_pass inference DIR onednn) + pass_library(quant_transpose2_dequant_onednn_fuse_pass inference DIR onednn) + pass_library(squeeze2_transpose2_onednn_fuse_pass inference DIR onednn) + pass_library(operator_unsqueeze2_onednn_fuse_pass inference DIR onednn) + pass_library(operator_reshape2_onednn_fuse_pass inference DIR onednn) + pass_library(cpu_quantize_placement_pass base DIR onednn) + pass_library(cpu_quantize_pass inference DIR onednn) + pass_library(cpu_quantize_squash_pass inference DIR onednn) + pass_library(reshape_transpose_matmul_onednn_fuse_pass inference DIR onednn) + pass_library(matmul_transpose_reshape_onednn_fuse_pass inference DIR onednn) + pass_library(batch_norm_act_fuse_pass inference DIR onednn) + pass_library(multi_gru_fuse_pass inference DIR onednn) + pass_library(multi_gru_seq_fuse_pass inference DIR onednn) + pass_library(quant_dequant_onednn_pass inference DIR onednn) + pass_library(compute_propagate_scales_onednn_pass inference DIR onednn) + pass_library(self_attention_fuse_pass inference DIR onednn) if(WITH_AVX AND AVX512F_FOUND AND AVX512F_FLAG) @@ -274,6 +274,8 @@ if(WITH_XPU) ${XPU_PASS_DEPS}) pass_library(decoder_attention_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) + pass_library(cross_attention_xpu_fuse_pass inference DIR xpu DEPS + ${XPU_PASS_DEPS}) pass_library(multi_encoder_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(multi_encoder_xpu_adaptive_seqlen_fuse_pass inference DIR xpu @@ -301,6 +303,8 @@ if(WITH_XPU) ${XPU_PASS_DEPS}) pass_library(add_layernorm_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) + pass_library(group_norm_silu_xpu_fuse_pass inference DIR xpu DEPS + ${XPU_PASS_DEPS}) pass_library(xpu_delete_cast_op_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(fold_interp_outsize_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) @@ -324,6 +328,8 @@ if(WITH_XPU) pass_library(quant_dequant_xpu_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(roformer_relative_pos_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) + pass_library(spatial_transformer_resblock_xpu_fuse_pass inference DIR xpu + DEPS ${XPU_PASS_DEPS}) endif() cc_library( @@ -536,19 +542,19 @@ if(NOT WIN32) SRCS dense_multihead_matmul_to_sparse_pass_tester.cc DEPS multihead_matmul_fuse_pass dense_multihead_matmul_to_sparse_pass) endif() -if(WITH_MKLDNN) +if(WITH_ONEDNN) cc_test( - test_depthwise_conv_mkldnn_pass - SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc - DEPS depthwise_conv_mkldnn_pass) + test_depthwise_conv_onednn_pass + SRCS onednn/depthwise_conv_onednn_pass_tester.cc + DEPS depthwise_conv_onednn_pass) cc_test( - test_int8_scale_calculation_mkldnn_pass - SRCS 
mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc - DEPS int8_scale_calculation_mkldnn_pass pass_test_util) + test_int8_scale_calculation_onednn_pass + SRCS onednn/int8_scale_calculation_onednn_pass_tester.cc + DEPS int8_scale_calculation_onednn_pass pass_test_util) cc_test( - test_params_quantization_mkldnn_pass - SRCS mkldnn/params_quantization_mkldnn_pass_tester.cc - DEPS params_quantization_mkldnn_pass) + test_params_quantization_onednn_pass + SRCS onednn/params_quantization_onednn_pass_tester.cc + DEPS params_quantization_onednn_pass) set(TEST_CONV_BN_PASS_DEPS conv_bn_fuse_pass graph_to_program_pass @@ -566,43 +572,43 @@ if(WITH_MKLDNN) set(TEST_CONV_BN_PASS_DEPS ${TEST_CONV_BN_PASS_DEPS} depthwise_conv) endif() cc_test( - test_mkldnn_placement_pass - SRCS mkldnn/mkldnn_placement_pass_tester.cc - DEPS mkldnn_placement_pass) + test_onednn_placement_pass + SRCS onednn/onednn_placement_pass_tester.cc + DEPS onednn_placement_pass) cc_test( - test_compute_propagate_scales_mkldnn_pass - SRCS mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc - DEPS compute_propagate_scales_mkldnn_pass naive_executor) + test_compute_propagate_scales_onednn_pass + SRCS onednn/compute_propagate_scales_onednn_pass_tester.cc + DEPS compute_propagate_scales_onednn_pass naive_executor) if(WITH_ONNXRUNTIME AND WIN32) # Copy onnxruntime for some c++ test in Windows, since the test will # be build only in CI, so suppose the generator in Windows is Ninja. - copy_onnx(test_compute_propagate_scales_mkldnn_pass) + copy_onnx(test_compute_propagate_scales_onednn_pass) endif() cc_test( test_cpu_quantize_placement_pass - SRCS mkldnn/cpu_quantize_placement_pass_tester.cc + SRCS onednn/cpu_quantize_placement_pass_tester.cc DEPS cpu_quantize_placement_pass) cc_test( test_cpu_quantize_pass - SRCS mkldnn/cpu_quantize_pass_tester.cc + SRCS onednn/cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor) cc_test( test_cpu_quantize_squash_pass - SRCS mkldnn/cpu_quantize_squash_pass_tester.cc + SRCS onednn/cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor) cc_test( - test_shuffle_channel_mkldnn_detect_pass - SRCS mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc - DEPS shuffle_channel_mkldnn_detect_pass) + test_shuffle_channel_onednn_detect_pass + SRCS onednn/shuffle_channel_onednn_detect_pass_tester.cc + DEPS shuffle_channel_onednn_detect_pass) cc_test( test_cpu_bfloat16_placement_pass - SRCS mkldnn/cpu_bfloat16_placement_pass_tester.cc + SRCS onednn/cpu_bfloat16_placement_pass_tester.cc DEPS cpu_bfloat16_placement_pass) cc_test( test_cpu_bfloat16_pass - SRCS mkldnn/cpu_bfloat16_pass_tester.cc + SRCS onednn/cpu_bfloat16_pass_tester.cc DEPS cpu_bfloat16_pass) endif() diff --git a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc index eda982bf77866..b91132784b95f 100644 --- a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc +++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc @@ -773,15 +773,6 @@ bool AutoMixedPrecisionPass::OutputVarsNotConvert( if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) { return true; } - } else if (GetOpOriginalType(op_desc->Type()) == "instance_norm") { - auto vecs = op_desc->Output("SavedMean"); - if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) { - return true; - } - vecs = op_desc->Output("SavedVariance"); - if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) { - return true; - } } return false; diff --git 
a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc index 4faebacb5f55c..947dc73333e0c 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc @@ -18,7 +18,7 @@ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" +#include "paddle/fluid/framework/ir/onednn/onednn_pass_util.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/tensor.h" diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc index 52ba852a730a5..cd823afa96dd4 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -16,6 +16,9 @@ #include #include "paddle/fluid/framework/ir/cutlass_teller.h" #include "paddle/fluid/framework/op_version_registry.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#endif namespace paddle { namespace framework { @@ -194,7 +197,11 @@ void ConvElementwiseAdd2ActFusePass::ApplyImpl(ir::Graph* graph) const { auto new_op_proto = PrepareOpDesc( base_op_desc, bias_name, bias1_name, act_op_type, act_op_out); framework::OpDesc new_op_desc(new_op_proto, nullptr); - if (cutlass_can_fuse && cutlass_enable && is_fp16_precision) { + int sm = 0; +#ifdef PADDLE_WITH_CUDA + sm = platform::GetGPUComputeCapability(platform::GetCurrentDeviceId()); +#endif + if (cutlass_can_fuse && cutlass_enable && (is_fp16_precision || sm >= 80)) { new_op_desc.SetAttr("use_cudnn", false); } diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc index a560c0ab52e5a..0f5f2f7cc78b6 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc @@ -15,6 +15,9 @@ #include "paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h" #include "paddle/fluid/framework/ir/cutlass_teller.h" #include "paddle/fluid/framework/op_version_registry.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#endif namespace paddle { namespace framework { @@ -215,7 +218,11 @@ void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const { auto new_op_proto = PrepareOpDesc(base_op_desc, bias_name, act_op_type, act_op_out, alpha); framework::OpDesc new_op_desc(new_op_proto, nullptr); - if (cutlass_can_fuse && cutlass_enable && is_fp16_precision) { + int sm = 0; +#ifdef PADDLE_WITH_CUDA + sm = platform::GetGPUComputeCapability(platform::GetCurrentDeviceId()); +#endif + if (cutlass_can_fuse && cutlass_enable && (is_fp16_precision || sm >= 80)) { new_op_desc.SetAttr("use_cudnn", false); new_op_desc.Flush(); } diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc index a3defa9f3ed06..4a0dd02db0f24 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc @@ -15,6 +15,9 @@ #include "paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h" #include "paddle/fluid/framework/ir/cutlass_teller.h" #include "paddle/fluid/framework/op_version_registry.h" +#ifdef PADDLE_WITH_CUDA +#include 
"paddle/fluid/platform/device/gpu/gpu_info.h" +#endif namespace paddle { namespace framework { @@ -121,14 +124,18 @@ void ConvElementwiseAddFusePass::ApplyImpl(ir::Graph* graph) const { static_cast(Get("model_precision")) == phi::DataType::FLOAT16 || Get("enable_gpu_mixed"); + bool cutlass_enable = Get("use_cutlass"); auto* scope = param_scope(); bool cutlass_can_fuse = CutlassTeller::Instance()->CbaCanSupport( conv_op->Op(), scope, act_type, Get("gpu_device_id")); - if (cutlass_can_fuse && cutlass_enable && is_fp16_precision) { + int sm = 0; +#ifdef PADDLE_WITH_CUDA + sm = platform::GetGPUComputeCapability(platform::GetCurrentDeviceId()); +#endif + if (cutlass_can_fuse && cutlass_enable && (is_fp16_precision || sm >= 80)) { new_op_desc.SetAttr("use_cudnn", false); } - auto* elementwise_add_op_desc = elementwise_add_op->Op(); auto out_threshold_attr = elementwise_add_op_desc->GetNullableAttr("out_threshold"); diff --git a/paddle/fluid/framework/ir/generate_pass_tester.cc b/paddle/fluid/framework/ir/generate_pass_tester.cc index 58a3741a924aa..f0f9330259fff 100644 --- a/paddle/fluid/framework/ir/generate_pass_tester.cc +++ b/paddle/fluid/framework/ir/generate_pass_tester.cc @@ -32,8 +32,7 @@ REGISTER_GENERATE_PASS(generate_fc_fuse) { } }; // replace - SUBGRAPH_(replace) = [subgraph = &replace, with_relu]( - VAR_(x), VAR_(y), VAR_(z)) { + SUBGRAPH_(replace) = [subgraph = &replace](VAR_(x), VAR_(y), VAR_(z)) { auto& fc = OP_(fc)({{"Input", x}, {"W", y}, {"Bias", z}}); return fc.Out("Out"); }; diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc index 0398117e08b8f..796e9a5e7f0a9 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc @@ -305,13 +305,6 @@ void EagerDeletionPass::ApplyImpl(ir::Graph *graph) const { auto recurrent_op_eager_deletion_pass = ir::PassRegistry::Instance().Get("recurrent_op_eager_deletion_pass"); recurrent_op_eager_deletion_pass->Apply(graph); - -#ifdef PADDLE_WITH_CINN - auto share_varinfo_into_cinn_pass = - ir::PassRegistry::Instance().Get("share_varinfo_into_cinn_pass"); - share_varinfo_into_cinn_pass->SetNotOwned(kMemOptVarInfoMapList, &var_infos); - share_varinfo_into_cinn_pass->Apply(graph); -#endif } } // namespace ir diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc index f48897674143a..4dde29316d23e 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc @@ -20,7 +20,7 @@ #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" #include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" #include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" -#include "paddle/fluid/operators/cinn/cinn_launch_op.h" +#include "paddle/fluid/operators/cinn/cinn_op_helper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/utils/string/string_helper.h" diff --git a/paddle/fluid/framework/ir/mkldnn/activation_onednn_fuse_pass.h b/paddle/fluid/framework/ir/onednn/activation_onednn_fuse_pass.h similarity index 98% rename from paddle/fluid/framework/ir/mkldnn/activation_onednn_fuse_pass.h rename to paddle/fluid/framework/ir/onednn/activation_onednn_fuse_pass.h index 
44631c54ef556..138d66731b54c 100644 --- a/paddle/fluid/framework/ir/mkldnn/activation_onednn_fuse_pass.h +++ b/paddle/fluid/framework/ir/onednn/activation_onednn_fuse_pass.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" +#include "paddle/fluid/framework/ir/onednn/onednn_pass_util.h" #include "paddle/fluid/framework/op_desc.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/batch_norm_act_fuse_pass.cc similarity index 98% rename from paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc rename to paddle/fluid/framework/ir/onednn/batch_norm_act_fuse_pass.cc index 230971a2dd286..788644dc85876 100644 --- a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/batch_norm_act_fuse_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.h" +#include "paddle/fluid/framework/ir/onednn/batch_norm_act_fuse_pass.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.h b/paddle/fluid/framework/ir/onednn/batch_norm_act_fuse_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.h rename to paddle/fluid/framework/ir/onednn/batch_norm_act_fuse_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc b/paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass.cc similarity index 98% rename from paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc rename to paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass.cc index b94c99c92cdbb..1c733636ca7b0 100644 --- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
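Because the capability registration moves together with the pass name (REGISTER_PASS_CAPABILITY is re-pointed further down in this hunk), version-compatibility checks must query the new string as well. A small sketch modeled on the depthwise tester later in this diff:

  ASSERT_TRUE(
      paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance()
          .IsPassCompatible("compute_propagate_scales_onednn_pass"));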
-#include "paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h" +#include "paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass.h" #include @@ -487,8 +487,8 @@ void ComputePropagateScalesMkldnnPass::PropagateScales( } void ComputePropagateScalesMkldnnPass::ApplyImpl(ir::Graph* graph) const { - VLOG(3) << "Convert paddle model to mkldnn quantized model."; - const std::string pattern_name = "compute_propagate_scales_mkldnn_pass"; + VLOG(3) << "Convert paddle model to onednn quantized model."; + const std::string pattern_name = "compute_propagate_scales_onednn_pass"; FusePassBase::Init(pattern_name, graph); const std::unordered_set scale_immutable_ops = { @@ -520,10 +520,10 @@ void ComputePropagateScalesMkldnnPass::ApplyImpl(ir::Graph* graph) const { } // namespace framework } // namespace paddle -REGISTER_PASS(compute_propagate_scales_mkldnn_pass, +REGISTER_PASS(compute_propagate_scales_onednn_pass, paddle::framework::ir::ComputePropagateScalesMkldnnPass); -REGISTER_PASS_CAPABILITY(compute_propagate_scales_mkldnn_pass) +REGISTER_PASS_CAPABILITY(compute_propagate_scales_onednn_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .LE("conv2d", 1) diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h b/paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass.h similarity index 98% rename from paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h rename to paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass.h index 2c2474438bedf..b63c74a884118 100644 --- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h +++ b/paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass.h @@ -17,7 +17,7 @@ #include #include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" +#include "paddle/fluid/framework/ir/onednn/onednn_pass_util.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass_tester.cc similarity index 99% rename from paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc rename to paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass_tester.cc index c09a2d1ffbb8d..9664647fd4214 100644 --- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc +++ b/paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass_tester.cc @@ -15,7 +15,7 @@ #include #include -#include "paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h" +#include "paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass.h" #include "paddle/fluid/framework/naive_executor.h" #include "paddle/phi/common/place.h" diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/conv_activation_onednn_fuse_pass.cc similarity index 94% rename from paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc rename to paddle/fluid/framework/ir/onednn/conv_activation_onednn_fuse_pass.cc index 077a29d113bb7..61c0457f7c740 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/conv_activation_onednn_fuse_pass.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the 
License. -#include "paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h" -#include "paddle/fluid/framework/ir/mkldnn/activation_onednn_fuse_pass.h" +#include "paddle/fluid/framework/ir/onednn/conv_activation_onednn_fuse_pass.h" +#include "paddle/fluid/framework/ir/onednn/activation_onednn_fuse_pass.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/utils/string/pretty_log.h" @@ -42,18 +42,18 @@ void ConvActivationMkldnnFusePass::FuseConvAct(Graph* graph, std::string& act_type) const { PADDLE_ENFORCE_NOT_NULL( graph, phi::errors::InvalidArgument("Graph cannot be nullptr.")); - FusePassBase::Init(conv_type + "_" + act_type + "_mkldnn_fuse_pass", graph); + FusePassBase::Init(conv_type + "_" + act_type + "_onednn_fuse_pass", graph); GraphPatternDetector gpd; patterns::OperatorActivation conv_act_pattern(gpd.mutable_pattern(), - "conv_activation_mkldnn_fuse"); + "conv_activation_onednn_fuse"); conv_act_pattern(conv_type, act_type); int found_conv_activation_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "conv_activation_mkldnn_fuse_pass op compat failed."; + LOG(WARNING) << "conv_activation_onednn_fuse_pass op compat failed."; return; } @@ -92,12 +92,12 @@ void ConvActivationMkldnnFusePass::FuseConvConcatAct( Graph* graph, std::string& act_type) const { PADDLE_ENFORCE_NOT_NULL( graph, phi::errors::InvalidArgument("Graph cannot be nullptr.")); - FusePassBase::Init("conv2d_concat_" + act_type + "_mkldnn_fuse_pass", graph); + FusePassBase::Init("conv2d_concat_" + act_type + "_onednn_fuse_pass", graph); GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); patterns::OperatorActivation conv_concat_act( - pattern, "conv2d_concat_" + act_type + "_mkldnn_fuse_pass"); + pattern, "conv2d_concat_" + act_type + "_onednn_fuse_pass"); conv_concat_act("concat", act_type); int found_conv_concat_activation_count = 0; @@ -105,7 +105,7 @@ void ConvActivationMkldnnFusePass::FuseConvConcatAct( Graph* g) { if (!IsCompat(subgraph, g)) { LOG(WARNING) - << "conv_concat_activation_mkldnn_fuse_pass op compat failed."; + << "conv_concat_activation_onednn_fuse_pass op compat failed."; return; } @@ -377,10 +377,10 @@ ConvActivationMkldnnFusePass::ConvActivationMkldnnFusePass() { } // namespace framework } // namespace paddle -REGISTER_PASS(conv_activation_mkldnn_fuse_pass, +REGISTER_PASS(conv_activation_onednn_fuse_pass, paddle::framework::ir::ConvActivationMkldnnFusePass); -REGISTER_PASS_CAPABILITY(conv_activation_mkldnn_fuse_pass) +REGISTER_PASS_CAPABILITY(conv_activation_onednn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .LE("conv2d", 1) diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/onednn/conv_activation_onednn_fuse_pass.h similarity index 94% rename from paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h rename to paddle/fluid/framework/ir/onednn/conv_activation_onednn_fuse_pass.h index b50fa8997fdf8..9821421254c66 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/onednn/conv_activation_onednn_fuse_pass.h @@ -33,9 +33,9 @@ class ConvActivationMkldnnFusePass : public FusePassBase { void FuseConvAct(Graph *graph, const std::string &conv_type, - std::string &act_type) const; + std::string &act_type) const; // NOLINT - void FuseConvConcatAct(Graph *graph, std::string &act_type) const; + void 
FuseConvConcatAct(Graph *graph, std::string &act_type) const; // NOLINT }; } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/conv_affine_channel_onednn_fuse_pass.cc similarity index 98% rename from paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc rename to paddle/fluid/framework/ir/onednn/conv_affine_channel_onednn_fuse_pass.cc index eedb5b3b60bd5..5ee6e361bcc92 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/conv_affine_channel_onednn_fuse_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/ir/onednn/conv_affine_channel_onednn_fuse_pass.h" #include @@ -313,10 +313,10 @@ void ConvAffineChannelFusePass::FuseConvAffineChannel( } // namespace framework } // namespace paddle -REGISTER_PASS(conv_affine_channel_mkldnn_fuse_pass, +REGISTER_PASS(conv_affine_channel_onednn_fuse_pass, paddle::framework::ir::ConvAffineChannelFusePass); -REGISTER_PASS_CAPABILITY(conv_affine_channel_mkldnn_fuse_pass) +REGISTER_PASS_CAPABILITY(conv_affine_channel_onednn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .LE("conv2d", 1) diff --git a/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/onednn/conv_affine_channel_onednn_fuse_pass.h similarity index 95% rename from paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.h rename to paddle/fluid/framework/ir/onednn/conv_affine_channel_onednn_fuse_pass.h index cc0a761c31ed2..49545ad565e52 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/onednn/conv_affine_channel_onednn_fuse_pass.h @@ -38,7 +38,7 @@ class ConvAffineChannelFusePass : public FusePassBase { void ApplyImpl(ir::Graph*) const override; void FuseConvAffineChannel(ir::Graph* graph, const std::string& conv_type) const; - const std::string name_scope_{"conv_affine_channel_mkldnn_fuse"}; + const std::string name_scope_{"conv_affine_channel_onednn_fuse"}; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/conv_bias_onednn_fuse_pass.cc similarity index 97% rename from paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc rename to paddle/fluid/framework/ir/onednn/conv_bias_onednn_fuse_pass.cc index 0aa71c3df5fb5..1cf663d13deef 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/conv_bias_onednn_fuse_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/ir/onednn/conv_bias_onednn_fuse_pass.h" #include #include @@ -448,21 +448,21 @@ void ConvBiasFusePass::FuseConvBias(ir::Graph* graph, } // namespace ir } // namespace framework } // namespace paddle -REGISTER_PASS(conv_bias_mkldnn_fuse_pass, +REGISTER_PASS(conv_bias_onednn_fuse_pass, paddle::framework::ir::ConvBiasFusePass); -REGISTER_PASS_CAPABILITY(conv_bias_mkldnn_fuse_pass) +REGISTER_PASS_CAPABILITY(conv_bias_onednn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .LE("conv2d", 1) .LE("elementwise_add", 1)); -REGISTER_PASS(conv_transpose_bias_mkldnn_fuse_pass, +REGISTER_PASS(conv_transpose_bias_onednn_fuse_pass, paddle::framework::ir::Conv2DTransposeBiasFusePass); -REGISTER_PASS_CAPABILITY(conv_transpose_bias_mkldnn_fuse_pass) +REGISTER_PASS_CAPABILITY(conv_transpose_bias_onednn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .LE("conv2d_transpose", 2) .LE("elementwise_add", 1)); -REGISTER_PASS(conv3d_bias_mkldnn_fuse_pass, +REGISTER_PASS(conv3d_bias_onednn_fuse_pass, paddle::framework::ir::Conv3DBiasFusePass); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/onednn/conv_bias_onednn_fuse_pass.h similarity index 97% rename from paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h rename to paddle/fluid/framework/ir/onednn/conv_bias_onednn_fuse_pass.h index 4fb8418686299..f53cdf19d29f2 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/onednn/conv_bias_onednn_fuse_pass.h @@ -40,7 +40,7 @@ class ConvBiasFusePass : public FusePassBase { const std::string& conv_type, const std::string& fused_conv) const; - const std::string name_scope_{"conv_bias_mkldnn_fuse"}; + const std::string name_scope_{"conv_bias_onednn_fuse"}; }; /* diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/conv_elementwise_add_onednn_fuse_pass.cc similarity index 95% rename from paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc rename to paddle/fluid/framework/ir/onednn/conv_elementwise_add_onednn_fuse_pass.cc index fecf4a4eaf5f8..7733730f7d605 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/conv_elementwise_add_onednn_fuse_pass.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
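A practical consequence of the conv_bias renames above for deployers: inference configurations that edit the pass list by name must switch to the new strings. A hypothetical sketch (assuming a paddle_infer::Config named config; AppendPass and DeletePass are the stock PaddlePassBuilder methods):

  auto* builder = config.pass_builder();
  builder->DeletePass("conv_bias_mkldnn_fuse_pass");   // old name, no longer registered
  builder->AppendPass("conv_bias_onednn_fuse_pass");   // new name registered above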
-#include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/ir/onednn/conv_elementwise_add_onednn_fuse_pass.h" #include "paddle/fluid/framework/ir/graph_traits.h" -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" +#include "paddle/fluid/framework/ir/onednn/onednn_pass_util.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/utils/string/pretty_log.h" @@ -156,12 +156,12 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConv( if (!IsCompat(subgraph, g)) { LOG(WARNING) - << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; + << "conv_elementwise_add_onednn_fuse_pass in op compat failed."; return; } if (residual_data->Var()->GetShape() != conv_output->Var()->GetShape()) { - LOG(WARNING) << "conv_elementwise_add_mkldnn_fuse_pass doesn't support " - + LOG(WARNING) << "conv_elementwise_add_onednn_fuse_pass doesn't support " - "broadcasting"; return; } @@ -235,7 +235,7 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( if (!IsCompat(subgraph, g)) { LOG(WARNING) - << "op compat for conv_elementwise_add_mkldnn_fuse_pass failed."; + << "op compat for conv_elementwise_add_onednn_fuse_pass failed."; return; } @@ -309,9 +309,9 @@ void ResidualConnectionMKLDNNFusePass::ApplyImpl(ir::Graph* graph) const { } // namespace framework } // namespace paddle -REGISTER_PASS(conv_elementwise_add_mkldnn_fuse_pass, +REGISTER_PASS(conv_elementwise_add_onednn_fuse_pass, paddle::framework::ir::ResidualConnectionMKLDNNFusePass); -REGISTER_PASS_CAPABILITY(conv_elementwise_add_mkldnn_fuse_pass) +REGISTER_PASS_CAPABILITY(conv_elementwise_add_onednn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .LE("conv2d", 1) diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/onednn/conv_elementwise_add_onednn_fuse_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h rename to paddle/fluid/framework/ir/onednn/conv_elementwise_add_onednn_fuse_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc b/paddle/fluid/framework/ir/onednn/cpu_bfloat16_pass.cc similarity index 99% rename from paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc rename to paddle/fluid/framework/ir/onednn/cpu_bfloat16_pass.cc index 528ba5747218a..1cebbfc1617a0 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc +++ b/paddle/fluid/framework/ir/onednn/cpu_bfloat16_pass.cc @@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h" +#include "paddle/fluid/framework/ir/onednn/cpu_bfloat16_pass.h" #include #include diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h b/paddle/fluid/framework/ir/onednn/cpu_bfloat16_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h rename to paddle/fluid/framework/ir/onednn/cpu_bfloat16_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc b/paddle/fluid/framework/ir/onednn/cpu_bfloat16_pass_tester.cc similarity index 99% rename from paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc rename to paddle/fluid/framework/ir/onednn/cpu_bfloat16_pass_tester.cc index 951d064364ce3..c31e59b39216a 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc +++ b/paddle/fluid/framework/ir/onednn/cpu_bfloat16_pass_tester.cc @@ -14,7 +14,7 @@ #include -#include "paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h" +#include "paddle/fluid/framework/ir/onednn/cpu_bfloat16_pass.h" #include "paddle/fluid/imperative/type_defs.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc b/paddle/fluid/framework/ir/onednn/cpu_bfloat16_placement_pass.cc similarity index 97% rename from paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc rename to paddle/fluid/framework/ir/onednn/cpu_bfloat16_placement_pass.cc index 8741b00f689f5..a07887dafb276 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc +++ b/paddle/fluid/framework/ir/onednn/cpu_bfloat16_placement_pass.cc @@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h" +#include "paddle/fluid/framework/ir/onednn/cpu_bfloat16_placement_pass.h" #include #include #include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/onednn_helper.h" #include "paddle/utils/string/pretty_log.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h b/paddle/fluid/framework/ir/onednn/cpu_bfloat16_placement_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h rename to paddle/fluid/framework/ir/onednn/cpu_bfloat16_placement_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc b/paddle/fluid/framework/ir/onednn/cpu_bfloat16_placement_pass_tester.cc similarity index 98% rename from paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc rename to paddle/fluid/framework/ir/onednn/cpu_bfloat16_placement_pass_tester.cc index c420c616a9ca6..e2de24cc398e0 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/onednn/cpu_bfloat16_placement_pass_tester.cc @@ -14,8 +14,8 @@ #include -#include "paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/framework/ir/onednn/cpu_bfloat16_placement_pass.h" +#include "paddle/fluid/platform/onednn_helper.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/onednn/cpu_quantize_pass.cc similarity index 99% rename from paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc rename to paddle/fluid/framework/ir/onednn/cpu_quantize_pass.cc index 0e9c452455de3..a512f4b8021f4 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/onednn/cpu_quantize_pass.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
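For orientation, the cpu_quantize passes being moved in this stretch of the diff form Paddle's oneDNN INT8 pipeline: the placement pass marks quantizable operators, cpu_quantize_pass inserts quantize/dequantize ops around them, and the squash pass removes the redundant pairs that result. A minimal hypothetical driver, ignoring the scale attributes the passes normally require and assuming a ProgramDesc prog as in the testers:

  std::unique_ptr<paddle::framework::ir::Graph> graph(
      new paddle::framework::ir::Graph(prog));
  for (const std::string& name : {"cpu_quantize_placement_pass",
                                  "cpu_quantize_pass",
                                  "cpu_quantize_squash_pass"}) {
    paddle::framework::ir::PassRegistry::Instance().Get(name)->Apply(graph.get());
  }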
-#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h" +#include "paddle/fluid/framework/ir/onednn/cpu_quantize_pass.h" #include #include #include -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/framework/ir/onednn/onednn_pass_util.h" +#include "paddle/fluid/platform/onednn_helper.h" #include "paddle/utils/string/pretty_log.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/onednn/cpu_quantize_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h rename to paddle/fluid/framework/ir/onednn/cpu_quantize_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/onednn/cpu_quantize_pass_tester.cc similarity index 99% rename from paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc rename to paddle/fluid/framework/ir/onednn/cpu_quantize_pass_tester.cc index c7e15e24216aa..3c1f4d8d60925 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/onednn/cpu_quantize_pass_tester.cc @@ -16,7 +16,7 @@ #include -#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h" // NOLINT +#include "paddle/fluid/framework/ir/onednn/cpu_quantize_pass.h" // NOLINT #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/imperative/type_defs.h" #include "paddle/phi/common/place.h" diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/onednn/cpu_quantize_placement_pass.cc similarity index 97% rename from paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc rename to paddle/fluid/framework/ir/onednn/cpu_quantize_placement_pass.cc index 2071f284126b7..56ba19a5cc22b 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc +++ b/paddle/fluid/framework/ir/onednn/cpu_quantize_placement_pass.cc @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h" +#include "paddle/fluid/framework/ir/onednn/cpu_quantize_placement_pass.h" #include -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" +#include "paddle/fluid/framework/ir/onednn/onednn_pass_util.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h b/paddle/fluid/framework/ir/onednn/cpu_quantize_placement_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h rename to paddle/fluid/framework/ir/onednn/cpu_quantize_placement_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc b/paddle/fluid/framework/ir/onednn/cpu_quantize_placement_pass_tester.cc similarity index 98% rename from paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc rename to paddle/fluid/framework/ir/onednn/cpu_quantize_placement_pass_tester.cc index 5cbd64c49d200..bd5db7c0e3df2 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/onednn/cpu_quantize_placement_pass_tester.cc @@ -14,8 +14,8 @@ #include -#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/framework/ir/onednn/cpu_quantize_placement_pass.h" +#include "paddle/fluid/platform/onednn_helper.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/onednn/cpu_quantize_squash_pass.cc similarity index 99% rename from paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc rename to paddle/fluid/framework/ir/onednn/cpu_quantize_squash_pass.cc index 578ab67f2a3b7..91f878a16abd0 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc +++ b/paddle/fluid/framework/ir/onednn/cpu_quantize_squash_pass.cc @@ -13,13 +13,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h" +#include "paddle/fluid/framework/ir/onednn/cpu_quantize_squash_pass.h" #include #include -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/framework/ir/onednn/onednn_pass_util.h" +#include "paddle/fluid/platform/onednn_helper.h" #include "paddle/phi/core/enforce.h" #include "paddle/utils/string/pretty_log.h" diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h b/paddle/fluid/framework/ir/onednn/cpu_quantize_squash_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h rename to paddle/fluid/framework/ir/onednn/cpu_quantize_squash_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc b/paddle/fluid/framework/ir/onednn/cpu_quantize_squash_pass_tester.cc similarity index 99% rename from paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc rename to paddle/fluid/framework/ir/onednn/cpu_quantize_squash_pass_tester.cc index 89e57108b17ef..fc57bdb6b52ef 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc +++ b/paddle/fluid/framework/ir/onednn/cpu_quantize_squash_pass_tester.cc @@ -14,7 +14,7 @@ #include -#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h" +#include "paddle/fluid/framework/ir/onednn/cpu_quantize_squash_pass.h" #include "paddle/fluid/framework/naive_executor.h" #include "paddle/phi/common/place.h" @@ -120,8 +120,9 @@ ProgramDesc BuildConvRequantProgramDesc(bool use_mkldnn, float scale_out, float scale_in) { ProgramDesc prog; - for (auto& v : std::initializer_list<std::string>( {"a", "w1", "b1", "d", "e", "f", "w2", "b2", "i"})) { + const std::vector<std::string> values = { + "a", "w1", "b1", "d", "e", "f", "w2", "b2", "i"}; + for (auto& v : values) { auto* var = prog.MutableBlock(0)->Var(v); if (v.find("w") == 0 || v.find("b") == 0) { var->SetPersistable(true); @@ -240,7 +241,7 @@ ProgramDesc BuildOpRequantProgramDesc(bool use_mkldnn, {"h"}, use_mkldnn, {matmul_scale, requant_scale3}); - SetOp(&prog, "concat", "Concat", {"c", "f", "h"}, {"g"}, {use_mkldnn}); + SetOp(&prog, "concat", "Concat", {"c", "f", "h"}, {"g"}, use_mkldnn); return prog; } @@ -683,7 +684,7 @@ ProgramDesc BuildRequantOpProgramDesc(bool use_mkldnn, {"h"}, use_mkldnn, {op_scale_in, op_scale_out}); - SetOp(&prog, "concat", "Concat", {"b", "e", "h"}, {"i"}, {use_mkldnn}); + SetOp(&prog, "concat", "Concat", {"b", "e", "h"}, {"i"}, use_mkldnn); return prog; } diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc b/paddle/fluid/framework/ir/onednn/depthwise_conv_onednn_pass.cc similarity index 90% rename from paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc rename to paddle/fluid/framework/ir/onednn/depthwise_conv_onednn_pass.cc index fca71d0bd6900..703a2c685e770 100644 --- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/onednn/depthwise_conv_onednn_pass.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ -#include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h" +#include "paddle/fluid/framework/ir/onednn/depthwise_conv_onednn_pass.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -76,7 +76,7 @@ DepthwiseConvMKLDNNPass::DepthwiseConvMKLDNNPass() { // NOLINT void DepthwiseConvMKLDNNPass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); - FusePassBase::Init("depthwise_conv_mkldnn_pass", graph); + FusePassBase::Init("depthwise_conv_onednn_pass", graph); GraphPatternDetector gpd; auto* pattern = gpd.mutable_pattern(); @@ -84,7 +84,7 @@ void DepthwiseConvMKLDNNPass::ApplyImpl(ir::Graph* graph) const { ->assert_is_op("depthwise_conv2d") ->assert_op_attr("use_mkldnn", true); - int found_depthwise_conv_mkldnn_count = 0; + int found_depthwise_conv_onednn_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { if (!IsCompat(subgraph, g)) { @@ -94,20 +94,20 @@ void DepthwiseConvMKLDNNPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "handle DepthwiseConvMKLDNN fuse"; GET_NODE(depthwise_conv, (*pattern)); depthwise_conv->Op()->SetType("conv2d"); - found_depthwise_conv_mkldnn_count++; + found_depthwise_conv_onednn_count++; }; gpd(graph, handler); - AddStatis(found_depthwise_conv_mkldnn_count); + AddStatis(found_depthwise_conv_onednn_count); } } // namespace ir } // namespace framework } // namespace paddle -REGISTER_PASS(depthwise_conv_mkldnn_pass, +REGISTER_PASS(depthwise_conv_onednn_pass, paddle::framework::ir::DepthwiseConvMKLDNNPass); -REGISTER_PASS_CAPABILITY(depthwise_conv_mkldnn_pass) +REGISTER_PASS_CAPABILITY(depthwise_conv_onednn_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination().LE( "depthwise_conv2d", 1)); diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h b/paddle/fluid/framework/ir/onednn/depthwise_conv_onednn_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h rename to paddle/fluid/framework/ir/onednn/depthwise_conv_onednn_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/onednn/depthwise_conv_onednn_pass_tester.cc similarity index 89% rename from paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc rename to paddle/fluid/framework/ir/onednn/depthwise_conv_onednn_pass_tester.cc index f74e95fff10d8..5fdb7ad959921 100644 --- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc +++ b/paddle/fluid/framework/ir/onednn/depthwise_conv_onednn_pass_tester.cc @@ -14,7 +14,7 @@ #include -#include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h" +#include "paddle/fluid/framework/ir/onednn/depthwise_conv_onednn_pass.h" #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { @@ -105,7 +105,7 @@ ProgramDesc BuildProgramDesc() { TEST(DepthwiseConvMKLDNNPass, pass_op_version_check) { ASSERT_TRUE( paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance() - .IsPassCompatible("depthwise_conv_mkldnn_pass")); + .IsPassCompatible("depthwise_conv_onednn_pass")); } TEST(DepthwiseConvMKLDNNPass, basic) { @@ -113,12 +113,12 @@ TEST(DepthwiseConvMKLDNNPass, basic) { std::unique_ptr graph(new ir::Graph(prog)); - auto pass = PassRegistry::Instance().Get("depthwise_conv_mkldnn_pass"); + auto pass = 
PassRegistry::Instance().Get("depthwise_conv_onednn_pass"); struct counters { - int mkldnn_depthwise_conv_nodes; + int onednn_depthwise_conv_nodes; int other_depthwise_conv_nodes; - int mkldnn_conv_nodes; + int onednn_conv_nodes; int other_conv_nodes; }; @@ -134,12 +134,12 @@ TEST(DepthwiseConvMKLDNNPass, basic) { auto* op = node->Op(); if (op->Type() == "conv2d") { if (PADDLE_GET_CONST(bool, op->GetAttr("use_mkldnn"))) - after.mkldnn_conv_nodes++; + after.onednn_conv_nodes++; else after.other_conv_nodes++; } else if (op->Type() == "depthwise_conv2d") { if (PADDLE_GET_CONST(bool, op->GetAttr("use_mkldnn"))) - after.mkldnn_depthwise_conv_nodes++; + after.onednn_depthwise_conv_nodes++; else after.other_depthwise_conv_nodes++; } @@ -149,13 +149,13 @@ TEST(DepthwiseConvMKLDNNPass, basic) { EXPECT_EQ(after.other_depthwise_conv_nodes, before.other_depthwise_conv_nodes); EXPECT_EQ(after.other_conv_nodes, before.other_conv_nodes); - EXPECT_EQ(after.mkldnn_depthwise_conv_nodes, - before.mkldnn_depthwise_conv_nodes - 1); - EXPECT_EQ(after.mkldnn_conv_nodes, before.mkldnn_conv_nodes + 1); + EXPECT_EQ(after.onednn_depthwise_conv_nodes, + before.onednn_depthwise_conv_nodes - 1); + EXPECT_EQ(after.onednn_conv_nodes, before.onednn_conv_nodes + 1); } } // namespace ir } // namespace framework } // namespace paddle -USE_PASS(depthwise_conv_mkldnn_pass); +USE_PASS(depthwise_conv_onednn_pass); diff --git a/paddle/fluid/framework/ir/mkldnn/elementwise_act_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/elementwise_act_onednn_fuse_pass.cc similarity index 95% rename from paddle/fluid/framework/ir/mkldnn/elementwise_act_onednn_fuse_pass.cc rename to paddle/fluid/framework/ir/onednn/elementwise_act_onednn_fuse_pass.cc index b6e84145aebff..3f0423870d366 100644 --- a/paddle/fluid/framework/ir/mkldnn/elementwise_act_onednn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/elementwise_act_onednn_fuse_pass.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/mkldnn/elementwise_act_onednn_fuse_pass.h" +#include "paddle/fluid/framework/ir/onednn/elementwise_act_onednn_fuse_pass.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/framework/ir/mkldnn/activation_onednn_fuse_pass.h" +#include "paddle/fluid/framework/ir/onednn/activation_onednn_fuse_pass.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/enforce.h" #include "paddle/utils/string/pretty_log.h" @@ -43,7 +43,7 @@ void ElementwiseActivationOneDNNPass::FuseElementwiseAct( const std::string &act_type) const { PADDLE_ENFORCE_NOT_NULL( graph, phi::errors::InvalidArgument("Graph cannot be nullptr.")); - FusePassBase::Init(elt_type + "_" + act_type + "_mkldnn_fuse_pass", graph); + FusePassBase::Init(elt_type + "_" + act_type + "_onednn_fuse_pass", graph); GraphPatternDetector gpd; patterns::OperatorActivation elementwise_act_pattern(gpd.mutable_pattern(), diff --git a/paddle/fluid/framework/ir/mkldnn/elementwise_act_onednn_fuse_pass.h b/paddle/fluid/framework/ir/onednn/elementwise_act_onednn_fuse_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/elementwise_act_onednn_fuse_pass.h rename to paddle/fluid/framework/ir/onednn/elementwise_act_onednn_fuse_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/fc_act_onednn_fuse_pass.cc similarity index 89% rename from paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc rename to paddle/fluid/framework/ir/onednn/fc_act_onednn_fuse_pass.cc index 47c76289d187c..aa4ee8cb5e767 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/fc_act_onednn_fuse_pass.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/ir/onednn/fc_act_onednn_fuse_pass.h" -#include "paddle/fluid/framework/ir/mkldnn/activation_onednn_fuse_pass.h" +#include "paddle/fluid/framework/ir/onednn/activation_onednn_fuse_pass.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/utils/string/pretty_log.h" @@ -34,11 +34,11 @@ void FuseFCActOneDNNPass::FuseFCAct(Graph *graph, const std::string &act_type) const { PADDLE_ENFORCE_NOT_NULL( graph, phi::errors::InvalidArgument("Graph cannot be nullptr.")); - FusePassBase::Init("fc_" + act_type + "_mkldnn_fuse_pass", graph); + FusePassBase::Init("fc_" + act_type + "_onednn_fuse_pass", graph); GraphPatternDetector gpd; patterns::OperatorActivation fc_act_pattern( - gpd.mutable_pattern(), "fc_" + act_type + "_mkldnn_fuse_pass"); + gpd.mutable_pattern(), "fc_" + act_type + "_onednn_fuse_pass"); fc_act_pattern("fc", act_type); int found_fc_act_count = 0; @@ -70,9 +70,9 @@ void FuseFCActOneDNNPass::FuseFCAct(Graph *graph, } // namespace framework } // namespace paddle -REGISTER_PASS(fc_act_mkldnn_fuse_pass, +REGISTER_PASS(fc_act_onednn_fuse_pass, paddle::framework::ir::FuseFCActOneDNNPass); -REGISTER_PASS_CAPABILITY(fc_act_mkldnn_fuse_pass) +REGISTER_PASS_CAPABILITY(fc_act_onednn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .LE("fc", 0) diff --git a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/onednn/fc_act_onednn_fuse_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.h rename to paddle/fluid/framework/ir/onednn/fc_act_onednn_fuse_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc b/paddle/fluid/framework/ir/onednn/fc_onednn_pass.cc similarity index 93% rename from paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc rename to paddle/fluid/framework/ir/onednn/fc_onednn_pass.cc index f4396d6d8175a..082579428a01a 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/onednn/fc_onednn_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h" +#include "paddle/fluid/framework/ir/onednn/fc_onednn_pass.h" #include "paddle/phi/core/enforce.h" #include "paddle/utils/string/pretty_log.h" @@ -33,10 +33,10 @@ void FCMKLDNNPass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL(graph, platform::errors::InvalidArgument( "Pointer to graph argument should not be NULL.")); - Init("fc_mkldnn_pass", graph); + Init("fc_onednn_pass", graph); GraphPatternDetector gpd; - patterns::FCMKLDNN fc_pattern(gpd.mutable_pattern(), "fc_mkldnn_pass"); + patterns::FCMKLDNN fc_pattern(gpd.mutable_pattern(), "fc_onednn_pass"); // searching for fc+residual doesn't make sense at this stage fc_pattern(false /*with_residual*/); @@ -89,4 +89,4 @@ void FCMKLDNNPass::ApplyImpl(ir::Graph* graph) const { } // namespace framework } // namespace paddle -REGISTER_PASS(fc_mkldnn_pass, paddle::framework::ir::FCMKLDNNPass); +REGISTER_PASS(fc_onednn_pass, paddle::framework::ir::FCMKLDNNPass); diff --git a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h b/paddle/fluid/framework/ir/onednn/fc_onednn_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h rename to paddle/fluid/framework/ir/onednn/fc_onednn_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.cc b/paddle/fluid/framework/ir/onednn/int8_scale_calculation_onednn_pass.cc similarity index 94% rename from paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.cc rename to paddle/fluid/framework/ir/onednn/int8_scale_calculation_onednn_pass.cc index a219e47072782..499a7734d71d6 100644 --- a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/onednn/int8_scale_calculation_onednn_pass.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h" +#include "paddle/fluid/framework/ir/onednn/int8_scale_calculation_onednn_pass.h" -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" +#include "paddle/fluid/framework/ir/onednn/onednn_pass_util.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/onednn_helper.h" #include "paddle/phi/core/enforce.h" namespace paddle { @@ -110,10 +110,10 @@ void Int8ScaleCalculationMkldnnPass::Int8ScaleImpl( PADDLE_ENFORCE_NOT_NULL(graph, platform::errors::InvalidArgument( "Pointer to graph argument should not be NULL.")); - FusePassBase::Init("int8_scale_calculation_mkldnn_pass", graph); + FusePassBase::Init("int8_scale_calculation_onednn_pass", graph); GraphPatternDetector gpd; patterns::Conv conv_pattern(gpd.mutable_pattern(), - "int8_scale_calculation_mkldnn_pass"); + "int8_scale_calculation_onednn_pass"); conv_pattern(conv_type); int found_int8_scales_count = 0; @@ -214,9 +214,9 @@ void Int8ScaleCalculationMkldnnPass::Int8ScaleImpl( } // namespace framework } // namespace paddle -REGISTER_PASS(int8_scale_calculation_mkldnn_pass, +REGISTER_PASS(int8_scale_calculation_onednn_pass, paddle::framework::ir::Int8ScaleCalculationMkldnnPass); -REGISTER_PASS_CAPABILITY(int8_scale_calculation_mkldnn_pass) +REGISTER_PASS_CAPABILITY(int8_scale_calculation_onednn_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination().LE( "conv2d", 1)); diff --git a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h b/paddle/fluid/framework/ir/onednn/int8_scale_calculation_onednn_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h rename to paddle/fluid/framework/ir/onednn/int8_scale_calculation_onednn_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/onednn/int8_scale_calculation_onednn_pass_tester.cc similarity index 96% rename from paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc rename to paddle/fluid/framework/ir/onednn/int8_scale_calculation_onednn_pass_tester.cc index fde7fb07b9108..e015276ac1f67 100644 --- a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc +++ b/paddle/fluid/framework/ir/onednn/int8_scale_calculation_onednn_pass_tester.cc @@ -14,7 +14,7 @@ #include -#include "paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h" +#include "paddle/fluid/framework/ir/onednn/int8_scale_calculation_onednn_pass.h" namespace paddle { namespace framework { @@ -95,7 +95,7 @@ void MainTest(bool convWithExistingBias, auto prog = BuildProgramDesc(convWithExistingBias, scale_weights); std::unique_ptr graph(new ir::Graph(prog)); auto pass = - PassRegistry::Instance().Get("int8_scale_calculation_mkldnn_pass"); + PassRegistry::Instance().Get("int8_scale_calculation_onednn_pass"); int original_nodes_num = graph->Nodes().size(); graph.reset(pass->Apply(graph.release())); int current_nodes_num = graph->Nodes().size(); @@ -153,4 +153,4 @@ TEST(Int8ScaleCalculationMkldnnPass, } // namespace framework } // namespace paddle -USE_PASS(int8_scale_calculation_mkldnn_pass); +USE_PASS(int8_scale_calculation_onednn_pass); diff --git a/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc b/paddle/fluid/framework/ir/onednn/interpolate_onednn_pass.cc similarity index 90% rename from 
paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc rename to paddle/fluid/framework/ir/onednn/interpolate_onednn_pass.cc index 04a6f8d6b770d..8f384931a589c 100644 --- a/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/onednn/interpolate_onednn_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.h" +#include "paddle/fluid/framework/ir/onednn/interpolate_onednn_pass.h" #include #include @@ -36,12 +36,12 @@ void InterpolateOneDNNPass::ApplyImpl(ir::Graph* graph) const { platform::errors::InvalidArgument( "Pointer to graph argument should not be NULL.")); if (!(graph->Has("use_mkldnn") && graph->Get<bool>("use_mkldnn"))) { - VLOG(3) << "Do not handle interpolate_mkldnn_pass"; + VLOG(3) << "Do not handle interpolate_onednn_pass"; return; } - VLOG(4) << "Handle interpolate_mkldnn_pass"; + VLOG(4) << "Handle interpolate_onednn_pass"; - Init("interpolate_mkldnn_pass", graph); + Init("interpolate_onednn_pass", graph); int found_count = 0; const std::vector<std::string> interpolate_op_types = {"bilinear_interp", @@ -69,5 +69,5 @@ void InterpolateOneDNNPass::ApplyImpl(ir::Graph* graph) const { } // namespace framework } // namespace paddle -REGISTER_PASS(interpolate_mkldnn_pass, +REGISTER_PASS(interpolate_onednn_pass, paddle::framework::ir::InterpolateOneDNNPass); diff --git a/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.h b/paddle/fluid/framework/ir/onednn/interpolate_onednn_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.h rename to paddle/fluid/framework/ir/onednn/interpolate_onednn_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/matmul_activation_onednn_fuse_pass.cc similarity index 93% rename from paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc rename to paddle/fluid/framework/ir/onednn/matmul_activation_onednn_fuse_pass.cc index d547f6fdd1ba2..66c96c268141d 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/matmul_activation_onednn_fuse_pass.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License.
-#include "paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/ir/onednn/matmul_activation_onednn_fuse_pass.h" -#include "paddle/fluid/framework/ir/mkldnn/activation_onednn_fuse_pass.h" -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" +#include "paddle/fluid/framework/ir/onednn/activation_onednn_fuse_pass.h" +#include "paddle/fluid/framework/ir/onednn/onednn_pass_util.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/utils/string/pretty_log.h" @@ -39,11 +39,11 @@ void MatmulActivationMkldnnFusePass::FuseMatmulAct( Graph* graph, const std::string& matmul_type, std::string& act_type) const { PADDLE_ENFORCE_NOT_NULL( graph, phi::errors::InvalidArgument("Graph cannot be nullptr.")); - FusePassBase::Init(matmul_type + "_" + act_type + "_mkldnn_fuse_pass", graph); + FusePassBase::Init(matmul_type + "_" + act_type + "_onednn_fuse_pass", graph); GraphPatternDetector gpd; patterns::OperatorActivation matmul_act_pattern( - gpd.mutable_pattern(), "matmul_activation_mkldnn_fuse"); + gpd.mutable_pattern(), "matmul_activation_onednn_fuse"); matmul_act_pattern(matmul_type, act_type); int found_matmul_activation_count = 0; @@ -52,7 +52,7 @@ void MatmulActivationMkldnnFusePass::FuseMatmulAct( VLOG(4) << "handle " + matmul_type + "+" + act_type + " fuse"; if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "matmul_activation_mkldnn_fuse_pass op compat failed."; + LOG(WARNING) << "matmul_activation_onednn_fuse_pass op compat failed."; return; } @@ -288,10 +288,10 @@ MatmulActivationMkldnnFusePass::MatmulActivationMkldnnFusePass() { } // namespace framework } // namespace paddle -REGISTER_PASS(matmul_activation_mkldnn_fuse_pass, +REGISTER_PASS(matmul_activation_onednn_fuse_pass, paddle::framework::ir::MatmulActivationMkldnnFusePass); -REGISTER_PASS_CAPABILITY(matmul_activation_mkldnn_fuse_pass) +REGISTER_PASS_CAPABILITY(matmul_activation_onednn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .EQ("fused_matmul", 0) diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/onednn/matmul_activation_onednn_fuse_pass.h similarity index 95% rename from paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.h rename to paddle/fluid/framework/ir/onednn/matmul_activation_onednn_fuse_pass.h index ebef63e292438..eec62d9e066fa 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/onednn/matmul_activation_onednn_fuse_pass.h @@ -33,7 +33,7 @@ class MatmulActivationMkldnnFusePass : public FusePassBase { void FuseMatmulAct(Graph *graph, const std::string &matmul_type, - std::string &act_type) const; + std::string &act_type) const; // NOLINT }; } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/matmul_elementwise_add_onednn_fuse_pass.cc similarity index 93% rename from paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc rename to paddle/fluid/framework/ir/onednn/matmul_elementwise_add_onednn_fuse_pass.cc index 5bb153d3ece0b..8d80eb57e5032 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/matmul_elementwise_add_onednn_fuse_pass.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/ir/onednn/matmul_elementwise_add_onednn_fuse_pass.h" #include "paddle/fluid/framework/ir/graph_traits.h" -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" +#include "paddle/fluid/framework/ir/onednn/onednn_pass_util.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/utils/string/pretty_log.h" @@ -62,7 +62,7 @@ void MatmulElementwiseAddMKLDNNFusePass::FuseMatmulElementwiseAdd( if (FindFuseOption(*matmul, *elementwise_add) != FUSE_MKLDNN) return; if (!IsCompat(subgraph, g)) { LOG(WARNING) - << "op compat for matmul_elementwise_add_mkldnn_fuse_pass failed."; + << "op compat for matmul_elementwise_add_onednn_fuse_pass failed."; return; } @@ -167,9 +167,9 @@ MatmulElementwiseAddMKLDNNFusePass::MatmulElementwiseAddMKLDNNFusePass() { } // namespace framework } // namespace paddle -REGISTER_PASS(matmul_elementwise_add_mkldnn_fuse_pass, +REGISTER_PASS(matmul_elementwise_add_onednn_fuse_pass, paddle::framework::ir::MatmulElementwiseAddMKLDNNFusePass); -REGISTER_PASS_CAPABILITY(matmul_elementwise_add_mkldnn_fuse_pass) +REGISTER_PASS_CAPABILITY(matmul_elementwise_add_onednn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .EQ("fused_matmul", 0) diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/onednn/matmul_elementwise_add_onednn_fuse_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.h rename to paddle/fluid/framework/ir/onednn/matmul_elementwise_add_onednn_fuse_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/matmul_transpose_reshape_onednn_fuse_pass.cc similarity index 94% rename from paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc rename to paddle/fluid/framework/ir/onednn/matmul_transpose_reshape_onednn_fuse_pass.cc index a899744672b4b..0b742d763bebc 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/matmul_transpose_reshape_onednn_fuse_pass.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.h" -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" +#include "paddle/fluid/framework/ir/onednn/matmul_transpose_reshape_onednn_fuse_pass.h" +#include "paddle/fluid/framework/ir/onednn/onednn_pass_util.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/enforce.h" #include "paddle/utils/string/pretty_log.h" @@ -37,12 +37,12 @@ void MatmulTransposeReshapeMKLDNNPass::Fuse( PADDLE_ENFORCE_NOT_NULL(graph, platform::errors::InvalidArgument( "Pointer to graph argument should not be NULL.")); - FusePassBase::Init(matmul_type + "_transpose_reshape_mkldnn_fuse_pass", + FusePassBase::Init(matmul_type + "_transpose_reshape_onednn_fuse_pass", graph); GraphPatternDetector gpd; patterns::MatmulTransposeReshapePattern mtrp( gpd.mutable_pattern(), - matmul_type + "_transpose_reshape_mkldnn_fuse_pass"); + matmul_type + "_transpose_reshape_onednn_fuse_pass"); mtrp(matmul_type); int found_matmul_transpose_reshape_count = 0; @@ -206,10 +206,10 @@ MatmulTransposeReshapeMKLDNNPass::MatmulTransposeReshapeMKLDNNPass() { } // namespace framework } // namespace paddle -REGISTER_PASS(matmul_transpose_reshape_mkldnn_fuse_pass, +REGISTER_PASS(matmul_transpose_reshape_onednn_fuse_pass, paddle::framework::ir::MatmulTransposeReshapeMKLDNNPass); -REGISTER_PASS_CAPABILITY(matmul_transpose_reshape_mkldnn_fuse_pass) +REGISTER_PASS_CAPABILITY(matmul_transpose_reshape_onednn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .EQ("fused_matmul", 0) diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/onednn/matmul_transpose_reshape_onednn_fuse_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.h rename to paddle/fluid/framework/ir/onednn/matmul_transpose_reshape_onednn_fuse_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/multi_gru_fuse_pass.cc similarity index 99% rename from paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc rename to paddle/fluid/framework/ir/onednn/multi_gru_fuse_pass.cc index a1f74d3423006..3b95f27a2d302 100644 --- a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/multi_gru_fuse_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.h" +#include "paddle/fluid/framework/ir/onednn/multi_gru_fuse_pass.h" #include diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.h b/paddle/fluid/framework/ir/onednn/multi_gru_fuse_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.h rename to paddle/fluid/framework/ir/onednn/multi_gru_fuse_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/multi_gru_seq_fuse_pass.cc similarity index 98% rename from paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc rename to paddle/fluid/framework/ir/onednn/multi_gru_seq_fuse_pass.cc index 7af7b67c4da49..214b8e12fd0b1 100644 --- a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/multi_gru_seq_fuse_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.h" +#include "paddle/fluid/framework/ir/onednn/multi_gru_seq_fuse_pass.h" #include #include @@ -23,7 +23,7 @@ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/onednn_helper.h" #include "paddle/utils/string/pretty_log.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.h b/paddle/fluid/framework/ir/onednn/multi_gru_seq_fuse_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.h rename to paddle/fluid/framework/ir/onednn/multi_gru_seq_fuse_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h b/paddle/fluid/framework/ir/onednn/onednn_pass_util.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h rename to paddle/fluid/framework/ir/onednn/onednn_pass_util.h diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/onednn/onednn_placement_pass.cc similarity index 95% rename from paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc rename to paddle/fluid/framework/ir/onednn/onednn_placement_pass.cc index 23e5497b12fde..7ff379f5e9120 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc +++ b/paddle/fluid/framework/ir/onednn/onednn_placement_pass.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h" +#include "paddle/fluid/framework/ir/onednn/onednn_placement_pass.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/operator.h" @@ -94,10 +94,10 @@ bool MKLDNNPlacementPass::IsSupport(const Node* op) const { } // namespace framework } // namespace paddle -REGISTER_PASS(mkldnn_placement_pass, paddle::framework::ir::MKLDNNPlacementPass) +REGISTER_PASS(onednn_placement_pass, paddle::framework::ir::MKLDNNPlacementPass) .RequirePassAttr("mkldnn_enabled_op_types"); -REGISTER_PASS_CAPABILITY(mkldnn_placement_pass) +REGISTER_PASS_CAPABILITY(onednn_placement_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination().LE( "fusion_gru", 1)); diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h b/paddle/fluid/framework/ir/onednn/onednn_placement_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h rename to paddle/fluid/framework/ir/onednn/onednn_placement_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc b/paddle/fluid/framework/ir/onednn/onednn_placement_pass_tester.cc similarity index 96% rename from paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc rename to paddle/fluid/framework/ir/onednn/onednn_placement_pass_tester.cc index b7697252a67c4..052c59ef84a99 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/onednn/onednn_placement_pass_tester.cc @@ -14,7 +14,7 @@ #include -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h" +#include "paddle/fluid/framework/ir/onednn/onednn_placement_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/utils/tribool.h" @@ -133,7 +133,7 @@ class PlacementPassTest { RegisterOpKernel({"conv2d", "pool2d", "concat", "relu"}); std::unique_ptr graph(new ir::Graph(prog)); - auto pass = PassRegistry::Instance().Get("mkldnn_placement_pass"); + auto pass = PassRegistry::Instance().Get("onednn_placement_pass"); pass->Set("mkldnn_enabled_op_types", new std::unordered_set(mkldnn_enabled_op_types)); @@ -156,7 +156,7 @@ class PlacementPassTest { } void PlacementNameTest() { - auto pass = PassRegistry::Instance().Get("mkldnn_placement_pass"); + auto pass = PassRegistry::Instance().Get("onednn_placement_pass"); EXPECT_EQ(static_cast(pass.get())->GetPlacementName(), "MKLDNN"); } @@ -186,4 +186,4 @@ TEST(MKLDNNPlacementPass, placement_name) { } // namespace framework } // namespace paddle -USE_PASS(mkldnn_placement_pass); +USE_PASS(onednn_placement_pass); diff --git a/paddle/fluid/framework/ir/mkldnn/operator_reshape2_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/operator_reshape2_onednn_fuse_pass.cc similarity index 97% rename from paddle/fluid/framework/ir/mkldnn/operator_reshape2_onednn_fuse_pass.cc rename to paddle/fluid/framework/ir/onednn/operator_reshape2_onednn_fuse_pass.cc index b128159237546..a21ddd579be3c 100644 --- a/paddle/fluid/framework/ir/mkldnn/operator_reshape2_onednn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/operator_reshape2_onednn_fuse_pass.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/mkldnn/operator_reshape2_onednn_fuse_pass.h" +#include "paddle/fluid/framework/ir/onednn/operator_reshape2_onednn_fuse_pass.h" -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" +#include "paddle/fluid/framework/ir/onednn/onednn_pass_util.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/backends/onednn/onednn_reuse.h" #include "paddle/utils/string/pretty_log.h" diff --git a/paddle/fluid/framework/ir/mkldnn/operator_reshape2_onednn_fuse_pass.h b/paddle/fluid/framework/ir/onednn/operator_reshape2_onednn_fuse_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/operator_reshape2_onednn_fuse_pass.h rename to paddle/fluid/framework/ir/onednn/operator_reshape2_onednn_fuse_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/operator_scale_onednn_fuse_pass.cc similarity index 97% rename from paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc rename to paddle/fluid/framework/ir/onednn/operator_scale_onednn_fuse_pass.cc index 4f6c2bfe0507b..2910849af5f8d 100644 --- a/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/operator_scale_onednn_fuse_pass.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.h" +#include "paddle/fluid/framework/ir/onednn/operator_scale_onednn_fuse_pass.h" -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" +#include "paddle/fluid/framework/ir/onednn/onednn_pass_util.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/backends/onednn/onednn_reuse.h" #include "paddle/utils/string/pretty_log.h" diff --git a/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.h b/paddle/fluid/framework/ir/onednn/operator_scale_onednn_fuse_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.h rename to paddle/fluid/framework/ir/onednn/operator_scale_onednn_fuse_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/operator_unsqueeze2_onednn_fuse_pass.cc similarity index 97% rename from paddle/fluid/framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.cc rename to paddle/fluid/framework/ir/onednn/operator_unsqueeze2_onednn_fuse_pass.cc index 28b01bc065b37..2aea55f473fd4 100644 --- a/paddle/fluid/framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/operator_unsqueeze2_onednn_fuse_pass.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.h" +#include "paddle/fluid/framework/ir/onednn/operator_unsqueeze2_onednn_fuse_pass.h" -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" +#include "paddle/fluid/framework/ir/onednn/onednn_pass_util.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/utils/string/pretty_log.h" diff --git a/paddle/fluid/framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.h b/paddle/fluid/framework/ir/onednn/operator_unsqueeze2_onednn_fuse_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.h rename to paddle/fluid/framework/ir/onednn/operator_unsqueeze2_onednn_fuse_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass.cc b/paddle/fluid/framework/ir/onednn/params_quantization_onednn_pass.cc similarity index 95% rename from paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass.cc rename to paddle/fluid/framework/ir/onednn/params_quantization_onednn_pass.cc index 11eba402b55d4..2255458535071 100644 --- a/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/onednn/params_quantization_onednn_pass.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass.h" +#include "paddle/fluid/framework/ir/onednn/params_quantization_onednn_pass.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/onednn_helper.h" #include "paddle/utils/string/pretty_log.h" namespace paddle { @@ -131,7 +131,7 @@ void ParamsQuantizationMkldnnPass::QuantizeConv(ir::Graph* graph, LOG(WARNING) << "Pass in op compat failed."; return; } - VLOG(4) << "handle convolution in params_quantization_mkldnn_pass"; + VLOG(4) << "handle convolution in params_quantization_onednn_pass"; GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); @@ -179,9 +179,9 @@ void ParamsQuantizationMkldnnPass::ApplyImpl(ir::Graph* graph) const { } // namespace framework } // namespace paddle -REGISTER_PASS(params_quantization_mkldnn_pass, +REGISTER_PASS(params_quantization_onednn_pass, paddle::framework::ir::ParamsQuantizationMkldnnPass); -REGISTER_PASS_CAPABILITY(params_quantization_mkldnn_pass) +REGISTER_PASS_CAPABILITY(params_quantization_onednn_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination().LE( "conv2d", 1)); diff --git a/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass.h b/paddle/fluid/framework/ir/onednn/params_quantization_onednn_pass.h similarity index 95% rename from paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass.h rename to paddle/fluid/framework/ir/onednn/params_quantization_onednn_pass.h index e681d9701b8d8..c8bf17cb081ec 100644 --- a/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass.h +++ b/paddle/fluid/framework/ir/onednn/params_quantization_onednn_pass.h @@ -37,7 +37,7 @@ class ParamsQuantizationMkldnnPass : public FusePassBase { bool with_residual_connection) const; private: - const std::string name_scope_ = "params_quantization_mkldnn_pass"; + const std::string name_scope_ = "params_quantization_onednn_pass"; }; } // namespace ir diff --git 
a/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/onednn/params_quantization_onednn_pass_tester.cc similarity index 98% rename from paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass_tester.cc rename to paddle/fluid/framework/ir/onednn/params_quantization_onednn_pass_tester.cc index 72b07fc8934de..36ff2110e582f 100755 --- a/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass_tester.cc +++ b/paddle/fluid/framework/ir/onednn/params_quantization_onednn_pass_tester.cc @@ -14,7 +14,7 @@ #include -#include "paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass.h" // NOLINT +#include "paddle/fluid/framework/ir/onednn/params_quantization_onednn_pass.h" // NOLINT #include "paddle/fluid/imperative/type_defs.h" #include "paddle/phi/common/place.h" @@ -39,8 +39,8 @@ struct Data { const std::vector& getData() const { return data; } private: - const std::vector shape; - const std::vector data; + const std::vector shape{}; + const std::vector data{}; }; struct TestScope { @@ -245,7 +245,7 @@ struct ParamsQuantizationMkldnnPassTestFixture : public ::testing::Test { void RunPassTest(std::unique_ptr program) { auto graph = program->CreateGraph(); - auto pass = PassRegistry::Instance().Get("params_quantization_mkldnn_pass"); + auto pass = PassRegistry::Instance().Get("params_quantization_onednn_pass"); graph.reset(pass->Apply(graph.release())); program->CheckGraph(graph); @@ -384,4 +384,4 @@ TEST_F(ParamsQuantizationMkldnnPassTestFixture, conv_with_bias_2g2o2i1h1ws) { } // namespace framework } // namespace paddle -USE_PASS(params_quantization_mkldnn_pass); +USE_PASS(params_quantization_onednn_pass); diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc b/paddle/fluid/framework/ir/onednn/quant_dequant_onednn_pass.cc similarity index 98% rename from paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc rename to paddle/fluid/framework/ir/onednn/quant_dequant_onednn_pass.cc index 734915b0dfe95..6ffd3963504f2 100644 --- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/onednn/quant_dequant_onednn_pass.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h" +#include "paddle/fluid/framework/ir/onednn/quant_dequant_onednn_pass.h" #include #include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" +#include "paddle/fluid/framework/ir/onednn/onednn_pass_util.h" #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { @@ -703,8 +703,8 @@ void QuantDequantMkldnnPass::RemoveCtrlVars(ir::Graph* graph) const { } void QuantDequantMkldnnPass::ApplyImpl(ir::Graph* graph) const { - VLOG(3) << "Convert paddle slim quantized model to mkldnn quantized model."; - const std::string pattern_name = "quant_dequant_mkldnn_pass"; + VLOG(3) << "Convert paddle slim quantized model to onednn quantized model."; + const std::string pattern_name = "quant_dequant_onednn_pass"; FusePassBase::Init(pattern_name, graph); const std::unordered_set skip_ops = {"conv2d", @@ -753,7 +753,7 @@ void QuantDequantMkldnnPass::ApplyImpl(ir::Graph* graph) const { RemoveCtrlVars(graph); // save var_quant_scales in the temporary save op's attr - // for compute_propagate_scales_mkldnn_pass + // for compute_propagate_scales_onednn_pass SaveInfoInTheTmpOp( graph, "has_quant_info", "var_quant_scales", var_quant_scales); } @@ -762,10 +762,10 @@ void QuantDequantMkldnnPass::ApplyImpl(ir::Graph* graph) const { } // namespace framework } // namespace paddle -REGISTER_PASS(quant_dequant_mkldnn_pass, +REGISTER_PASS(quant_dequant_onednn_pass, paddle::framework::ir::QuantDequantMkldnnPass); -REGISTER_PASS_CAPABILITY(quant_dequant_mkldnn_pass) +REGISTER_PASS_CAPABILITY(quant_dequant_onednn_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .LE("conv2d", 1) diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h b/paddle/fluid/framework/ir/onednn/quant_dequant_onednn_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h rename to paddle/fluid/framework/ir/onednn/quant_dequant_onednn_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/quant_transpose2_dequant_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/quant_transpose2_dequant_onednn_fuse_pass.cc similarity index 98% rename from paddle/fluid/framework/ir/mkldnn/quant_transpose2_dequant_onednn_fuse_pass.cc rename to paddle/fluid/framework/ir/onednn/quant_transpose2_dequant_onednn_fuse_pass.cc index 5d5edb83a9134..37dfec26b36f2 100644 --- a/paddle/fluid/framework/ir/mkldnn/quant_transpose2_dequant_onednn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/quant_transpose2_dequant_onednn_fuse_pass.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/mkldnn/quant_transpose2_dequant_onednn_fuse_pass.h" -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" +#include "paddle/fluid/framework/ir/onednn/quant_transpose2_dequant_onednn_fuse_pass.h" +#include "paddle/fluid/framework/ir/onednn/onednn_pass_util.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/utils/string/pretty_log.h" diff --git a/paddle/fluid/framework/ir/mkldnn/quant_transpose2_dequant_onednn_fuse_pass.h b/paddle/fluid/framework/ir/onednn/quant_transpose2_dequant_onednn_fuse_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/quant_transpose2_dequant_onednn_fuse_pass.h rename to paddle/fluid/framework/ir/onednn/quant_transpose2_dequant_onednn_fuse_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/reshape_transpose_matmul_onednn_fuse_pass.cc similarity index 95% rename from paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc rename to paddle/fluid/framework/ir/onednn/reshape_transpose_matmul_onednn_fuse_pass.cc index 07675a3f4efeb..f3250c32604c6 100644 --- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/reshape_transpose_matmul_onednn_fuse_pass.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h" -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" +#include "paddle/fluid/framework/ir/onednn/reshape_transpose_matmul_onednn_fuse_pass.h" +#include "paddle/fluid/framework/ir/onednn/onednn_pass_util.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/enforce.h" #include "paddle/utils/string/pretty_log.h" @@ -53,13 +53,13 @@ void ReshapeTransposeMatmulMkldnnFusePass::Fuse( PADDLE_ENFORCE_NOT_NULL(graph, platform::errors::InvalidArgument( "Pointer to graph argument should not be NULL.")); - FusePassBase::Init("reshape_transpose_" + matmul_type + "_mkldnn_fuse_pass", + FusePassBase::Init("reshape_transpose_" + matmul_type + "_onednn_fuse_pass", graph); GraphPatternDetector gpd; patterns::ReshapeTransposeMatmulPattern rtm_pattern( gpd.mutable_pattern(), - "reshape_transpose_" + matmul_type + "_mkldnn_fuse_pass"); + "reshape_transpose_" + matmul_type + "_onednn_fuse_pass"); rtm_pattern(matmul_type, with_reshape_xshape, with_transpose_xshape); @@ -68,7 +68,7 @@ void ReshapeTransposeMatmulMkldnnFusePass::Fuse( Graph *g) { if (!IsCompat(subgraph, g)) { LOG(WARNING) << "Op compatible check in reshape_transpose_" << matmul_type - << "_mkldnn_fuse_pass failed."; + << "_onednn_fuse_pass failed."; return; } @@ -268,10 +268,10 @@ ReshapeTransposeMatmulMkldnnFusePass::ReshapeTransposeMatmulMkldnnFusePass() { } // namespace framework } // namespace paddle -REGISTER_PASS(reshape_transpose_matmul_mkldnn_fuse_pass, +REGISTER_PASS(reshape_transpose_matmul_onednn_fuse_pass, paddle::framework::ir::ReshapeTransposeMatmulMkldnnFusePass); -REGISTER_PASS_CAPABILITY(reshape_transpose_matmul_mkldnn_fuse_pass) +REGISTER_PASS_CAPABILITY(reshape_transpose_matmul_onednn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .EQ("reshape2", 0) diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h 
b/paddle/fluid/framework/ir/onednn/reshape_transpose_matmul_onednn_fuse_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h rename to paddle/fluid/framework/ir/onednn/reshape_transpose_matmul_onednn_fuse_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/scale_matmul_fuse_pass.cc similarity index 97% rename from paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc rename to paddle/fluid/framework/ir/onednn/scale_matmul_fuse_pass.cc index 9f50aefc46ce5..7ae647c6d28f7 100644 --- a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/scale_matmul_fuse_pass.cc @@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h" +#include "paddle/fluid/framework/ir/onednn/scale_matmul_fuse_pass.h" #include #include #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/onednn_helper.h" #include "paddle/utils/string/pretty_log.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h b/paddle/fluid/framework/ir/onednn/scale_matmul_fuse_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h rename to paddle/fluid/framework/ir/onednn/scale_matmul_fuse_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/self_attention_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/self_attention_fuse_pass.cc similarity index 98% rename from paddle/fluid/framework/ir/mkldnn/self_attention_fuse_pass.cc rename to paddle/fluid/framework/ir/onednn/self_attention_fuse_pass.cc index e02b167a19e3b..4e409f764491c 100644 --- a/paddle/fluid/framework/ir/mkldnn/self_attention_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/self_attention_fuse_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/self_attention_fuse_pass.h" +#include "paddle/fluid/framework/ir/onednn/self_attention_fuse_pass.h" #include #include "paddle/fluid/framework/lod_tensor.h" diff --git a/paddle/fluid/framework/ir/mkldnn/self_attention_fuse_pass.h b/paddle/fluid/framework/ir/onednn/self_attention_fuse_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/self_attention_fuse_pass.h rename to paddle/fluid/framework/ir/onednn/self_attention_fuse_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc b/paddle/fluid/framework/ir/onednn/shuffle_channel_onednn_detect_pass.cc similarity index 97% rename from paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc rename to paddle/fluid/framework/ir/onednn/shuffle_channel_onednn_detect_pass.cc index 764712a2fcd8a..7bce1813fed8a 100644 --- a/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc +++ b/paddle/fluid/framework/ir/onednn/shuffle_channel_onednn_detect_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.h" +#include "paddle/fluid/framework/ir/onednn/shuffle_channel_onednn_detect_pass.h" #include @@ -235,9 +235,9 @@ void ShuffleChannelMKLDNNDetectPass::ApplyImpl(ir::Graph* graph) const { } // namespace framework } // namespace paddle -REGISTER_PASS(shuffle_channel_mkldnn_detect_pass, +REGISTER_PASS(shuffle_channel_onednn_detect_pass, paddle::framework::ir::ShuffleChannelMKLDNNDetectPass); -REGISTER_PASS_CAPABILITY(shuffle_channel_mkldnn_detect_pass) +REGISTER_PASS_CAPABILITY(shuffle_channel_onednn_detect_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .EQ("reshape2", 0) diff --git a/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.h b/paddle/fluid/framework/ir/onednn/shuffle_channel_onednn_detect_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.h rename to paddle/fluid/framework/ir/onednn/shuffle_channel_onednn_detect_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc b/paddle/fluid/framework/ir/onednn/shuffle_channel_onednn_detect_pass_tester.cc similarity index 94% rename from paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc rename to paddle/fluid/framework/ir/onednn/shuffle_channel_onednn_detect_pass_tester.cc index 4c6fc3774e840..da389d3a1353c 100644 --- a/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc +++ b/paddle/fluid/framework/ir/onednn/shuffle_channel_onednn_detect_pass_tester.cc @@ -16,7 +16,7 @@ #include -#include "paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.h" +#include "paddle/fluid/framework/ir/onednn/shuffle_channel_onednn_detect_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" namespace paddle { @@ -55,7 +55,7 @@ void MainTest() { int original_nodes_num = graph->Nodes().size(); auto pass = - PassRegistry::Instance().Get("shuffle_channel_mkldnn_detect_pass"); + PassRegistry::Instance().Get("shuffle_channel_onednn_detect_pass"); graph.reset(pass->Apply(graph.release())); int current_nodes_num = graph->Nodes().size(); @@ -82,4 +82,4 @@ TEST(ShuffleChannelOneDNNDetectPass, ShuffleChannelOneDNNDetectPassTest) { } // namespace framework } // namespace paddle -USE_PASS(shuffle_channel_mkldnn_detect_pass); +USE_PASS(shuffle_channel_onednn_detect_pass); diff --git a/paddle/fluid/framework/ir/mkldnn/softplus_activation_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/softplus_activation_onednn_fuse_pass.cc similarity index 94% rename from paddle/fluid/framework/ir/mkldnn/softplus_activation_onednn_fuse_pass.cc rename to paddle/fluid/framework/ir/onednn/softplus_activation_onednn_fuse_pass.cc index 2030a7dadc02e..d18765ff27bdd 100644 --- a/paddle/fluid/framework/ir/mkldnn/softplus_activation_onednn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/softplus_activation_onednn_fuse_pass.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/mkldnn/softplus_activation_onednn_fuse_pass.h" +#include "paddle/fluid/framework/ir/onednn/softplus_activation_onednn_fuse_pass.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/framework/ir/mkldnn/activation_onednn_fuse_pass.h" -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" +#include "paddle/fluid/framework/ir/onednn/activation_onednn_fuse_pass.h" +#include "paddle/fluid/framework/ir/onednn/onednn_pass_util.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/enforce.h" #include "paddle/utils/string/pretty_log.h" diff --git a/paddle/fluid/framework/ir/mkldnn/softplus_activation_onednn_fuse_pass.h b/paddle/fluid/framework/ir/onednn/softplus_activation_onednn_fuse_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/softplus_activation_onednn_fuse_pass.h rename to paddle/fluid/framework/ir/onednn/softplus_activation_onednn_fuse_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/squeeze2_transpose2_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/squeeze2_transpose2_onednn_fuse_pass.cc similarity index 96% rename from paddle/fluid/framework/ir/mkldnn/squeeze2_transpose2_onednn_fuse_pass.cc rename to paddle/fluid/framework/ir/onednn/squeeze2_transpose2_onednn_fuse_pass.cc index 1aafcc0614afb..4af9c6a770436 100644 --- a/paddle/fluid/framework/ir/mkldnn/squeeze2_transpose2_onednn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/squeeze2_transpose2_onednn_fuse_pass.cc @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/squeeze2_transpose2_onednn_fuse_pass.h" -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" +#include "paddle/fluid/framework/ir/onednn/squeeze2_transpose2_onednn_fuse_pass.h" +#include "paddle/fluid/framework/ir/onednn/onednn_pass_util.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/backends/onednn/onednn_reuse.h" #include "paddle/utils/string/pretty_log.h" diff --git a/paddle/fluid/framework/ir/mkldnn/squeeze2_transpose2_onednn_fuse_pass.h b/paddle/fluid/framework/ir/onednn/squeeze2_transpose2_onednn_fuse_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/squeeze2_transpose2_onednn_fuse_pass.h rename to paddle/fluid/framework/ir/onednn/squeeze2_transpose2_onednn_fuse_pass.h diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index 779d9986ef8a1..0b3ebd324dc7a 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -29,7 +29,7 @@ class Graph; } // namespace framework } // namespace paddle #ifdef PADDLE_WITH_DNNL -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/onednn_helper.h" #endif namespace paddle { @@ -68,6 +68,7 @@ static const std::vector xpu_support_subgraph_passes = { "constant_folding_pass", "delete_elementwise_mul_op_pass", "generate_sequence_xpu_fuse_pass", + "group_norm_silu_xpu_fuse_pass", "embedding_with_eltwise_add_xpu_fuse_pass", "multi_encoder_xpu_fuse_pass", "multi_encoder_xpu_adaptive_seqlen_fuse_pass", @@ -82,6 +83,7 @@ static const std::vector xpu_support_subgraph_passes = { "fc_xpu_fuse_pass", "link_xpu_op_max_pass", "xpu_delete_cast_op_pass", + "spatial_transformer_resblock_xpu_fuse_pass", }; static std::vector support_subgraph_generate_passes; diff 
--git a/paddle/fluid/framework/ir/transfer_layout_pass.cc b/paddle/fluid/framework/ir/transfer_layout_pass.cc
index c31737958dffb..b989f51dfe8f9 100644
--- a/paddle/fluid/framework/ir/transfer_layout_pass.cc
+++ b/paddle/fluid/framework/ir/transfer_layout_pass.cc
@@ -107,13 +107,17 @@ void TransferLayoutPass::ApplyImpl(ir::Graph *graph) const {
   FusePassBase::Init("fused_conv2d_add_act_layout_transfer", graph);
   auto *scope = param_scope();
-  // only float16 compute precision need insert transfer_layout.
+  // float16 is handled by both backends (cutlass and cudnn); float32 only by
+  // cutlass. Why? Because cudnn NHWC fp32 shows a performance degradation,
+  // so the layout transfer is inserted for fp16, or for fp32 with cutlass on.
   bool is_fp16_precision =
       static_cast<phi::DataType>(Get<int>("model_precision")) ==
           phi::DataType::FLOAT16 ||
       Get<bool>("enable_gpu_mixed");
-  if (!is_fp16_precision) return;
+  bool cutlass_enable = Get<bool>("use_cutlass");
+
+  if (!is_fp16_precision && !cutlass_enable) return;
   PADDLE_ENFORCE_EQ(graph->IsMainGraph(),
                     true,
diff --git a/paddle/fluid/framework/ir/xpu/cross_attention_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/cross_attention_xpu_fuse_pass.cc
new file mode 100644
index 0000000000000..cea83dae5e8bf
--- /dev/null
+++ b/paddle/fluid/framework/ir/xpu/cross_attention_xpu_fuse_pass.cc
@@ -0,0 +1,666 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
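For reference, the subgraph matched by this new pass computes standard scaled-dot-product cross attention, qkv = softmax(alpha * (q Wq)(kv Wk)^T + mask) (kv Wv). The following is a minimal single-head sketch of the computation the fused cross_attention_xpu op replaces; it assumes row-major float buffers and an additive mask, and every name in it is illustrative, not Paddle or xdnn API.

#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

// out[i, :] = sum_j softmax_j(alpha * <q_i, k_j> + mask[i, j]) * v[j, :]
static void AttentionCoreRef(const std::vector<float>& q,     // [seq_q, d]
                             const std::vector<float>& k,     // [seq_kv, d]
                             const std::vector<float>& v,     // [seq_kv, d]
                             const std::vector<float>& mask,  // [seq_q, seq_kv]
                             float alpha, int seq_q, int seq_kv, int d,
                             std::vector<float>* out) {       // [seq_q, d]
  out->assign(static_cast<size_t>(seq_q) * d, 0.0f);
  std::vector<float> logits(seq_kv);
  for (int i = 0; i < seq_q; ++i) {
    float max_logit = -std::numeric_limits<float>::infinity();
    for (int j = 0; j < seq_kv; ++j) {
      float dot = 0.0f;
      for (int c = 0; c < d; ++c) dot += q[i * d + c] * k[j * d + c];
      logits[j] = alpha * dot + mask[i * seq_kv + j];
      max_logit = std::max(max_logit, logits[j]);
    }
    float denom = 0.0f;
    for (int j = 0; j < seq_kv; ++j) {
      logits[j] = std::exp(logits[j] - max_logit);  // stable softmax
      denom += logits[j];
    }
    for (int j = 0; j < seq_kv; ++j) {
      const float w = logits[j] / denom;
      for (int c = 0; c < d; ++c) (*out)[i * d + c] += w * v[j * d + c];
    }
  }
}

In the pass below, q comes from input_q x q_w + q_bias and k/v from input_kv, each split into head_num heads of size head_dim; alpha is taken from the optional scale op when it is present.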
+ +#include "paddle/fluid/framework/ir/xpu/cross_attention_xpu_fuse_pass.h" + +#include "glog/logging.h" + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/xpu/pass_utils.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +namespace patterns { + +struct CrossAttentionFusePattern : public PatternBase { + CrossAttentionFusePattern(PDPattern* pattern, + const std::string& name_scope, + bool with_q_scale); + + // declare operator node's name + PATTERN_DECL_NODE(q_mul); + PATTERN_DECL_NODE(k_mul); + PATTERN_DECL_NODE(v_mul); + PATTERN_DECL_NODE(q_add); + PATTERN_DECL_NODE(k_add); + PATTERN_DECL_NODE(v_add); + PATTERN_DECL_NODE(reshape_1); + PATTERN_DECL_NODE(reshape_2); + PATTERN_DECL_NODE(reshape_3); + PATTERN_DECL_NODE(transpose_1); + PATTERN_DECL_NODE(transpose_2); + PATTERN_DECL_NODE(transpose_3); + PATTERN_DECL_NODE(scale); + PATTERN_DECL_NODE(qk_matmul); + PATTERN_DECL_NODE(qk_add); + PATTERN_DECL_NODE(qk_softmax); + PATTERN_DECL_NODE(qkv_matmul); + PATTERN_DECL_NODE(transpose_4); + PATTERN_DECL_NODE(reshape_4); + + // declare variable node's name + PATTERN_DECL_NODE(input_q); + PATTERN_DECL_NODE(input_kv); + PATTERN_DECL_NODE(mask); + PATTERN_DECL_NODE(q_mul_w); + PATTERN_DECL_NODE(k_mul_w); + PATTERN_DECL_NODE(v_mul_w); + PATTERN_DECL_NODE(q_mul_out); + PATTERN_DECL_NODE(k_mul_out); + PATTERN_DECL_NODE(v_mul_out); + PATTERN_DECL_NODE(q_add_bias); + PATTERN_DECL_NODE(k_add_bias); + PATTERN_DECL_NODE(v_add_bias); + PATTERN_DECL_NODE(q_add_out); + PATTERN_DECL_NODE(k_add_out); + PATTERN_DECL_NODE(v_add_out); + PATTERN_DECL_NODE(reshape_1_out); + PATTERN_DECL_NODE(reshape_2_out); + PATTERN_DECL_NODE(reshape_3_out); + PATTERN_DECL_NODE(transpose_1_out); + PATTERN_DECL_NODE(transpose_2_out); + PATTERN_DECL_NODE(transpose_3_out); + PATTERN_DECL_NODE(scale_out); + PATTERN_DECL_NODE(qk_matmul_out); + PATTERN_DECL_NODE(qk_add_out); + PATTERN_DECL_NODE(qk_softmax_out); + PATTERN_DECL_NODE(qkv_matmul_out); + PATTERN_DECL_NODE(transpose_4_out); + PATTERN_DECL_NODE(output); + + private: + bool with_q_scale_{false}; +}; + +CrossAttentionFusePattern::CrossAttentionFusePattern( + PDPattern* pattern, const std::string& name_scope, bool with_q_scale) + : PatternBase(pattern, name_scope, name_scope), + with_q_scale_(with_q_scale) { + auto* input_q = pattern->NewNode(input_q_repr()) + ->assert_is_op_input("matmul_v2", "X") + ->AsInput(); + auto* input_kv = pattern->NewNode(input_kv_repr()) + ->assert_is_op_input("matmul_v2", "X") + ->AsInput(); + auto* mask = pattern->NewNode(mask_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* q_mul_w = + pattern->NewNode(q_mul_w_repr())->assert_is_op_input("matmul_v2", "Y"); + auto* q_mul = pattern->NewNode(q_mul_repr())->assert_is_op("matmul_v2"); + auto* q_mul_out = pattern->NewNode(q_mul_out_repr()) + ->assert_is_op_output("matmul_v2", "Out") + ->assert_is_op_input("elementwise_add", "X"); + auto* k_mul_w = + pattern->NewNode(k_mul_w_repr())->assert_is_op_input("matmul_v2", "Y"); + auto* k_mul = pattern->NewNode(k_mul_repr())->assert_is_op("matmul_v2"); + auto* k_mul_out = pattern->NewNode(k_mul_out_repr()) + ->assert_is_op_output("matmul_v2", "Out") + ->assert_is_op_input("elementwise_add", "X"); + auto* v_mul_w = + pattern->NewNode(v_mul_w_repr())->assert_is_op_input("matmul_v2", "Y"); + auto* v_mul = pattern->NewNode(v_mul_repr())->assert_is_op("matmul_v2"); + auto* 
v_mul_out = pattern->NewNode(v_mul_out_repr()) + ->assert_is_op_output("matmul_v2", "Out") + ->assert_is_op_input("elementwise_add", "X"); + auto* q_add = pattern->NewNode(q_add_repr())->assert_is_op("elementwise_add"); + auto* q_add_bias = pattern->NewNode(q_add_bias_repr()) + ->assert_is_op_input("elementwise_add", "Y"); + auto* q_add_out = pattern->NewNode(q_add_out_repr()) + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("reshape2", "X"); + auto* k_add = pattern->NewNode(k_add_repr())->assert_is_op("elementwise_add"); + auto* k_add_bias = pattern->NewNode(k_add_bias_repr()) + ->assert_is_op_input("elementwise_add", "Y"); + auto* k_add_out = pattern->NewNode(k_add_out_repr()) + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("reshape2", "X"); + auto* v_add = pattern->NewNode(v_add_repr())->assert_is_op("elementwise_add"); + auto* v_add_bias = pattern->NewNode(v_add_bias_repr()) + ->assert_is_op_input("elementwise_add", "Y"); + auto* v_add_out = pattern->NewNode(v_add_out_repr()) + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("reshape2", "X"); + auto* reshape_1 = + pattern->NewNode(reshape_1_repr())->assert_is_op("reshape2"); + auto* reshape_1_out = pattern->NewNode(reshape_1_out_repr()) + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("transpose2", "X"); + auto* reshape_2 = + pattern->NewNode(reshape_2_repr())->assert_is_op("reshape2"); + auto* reshape_2_out = pattern->NewNode(reshape_2_out_repr()) + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("transpose2", "X"); + auto* reshape_3 = + pattern->NewNode(reshape_3_repr())->assert_is_op("reshape2"); + auto* reshape_3_out = pattern->NewNode(reshape_3_out_repr()) + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("transpose2", "X"); + auto* transpose_1 = + pattern->NewNode(transpose_1_repr()) + ->assert_is_op("transpose2") + ->assert_more([](Node* node) { + auto* op_desc = node->Op(); + auto axis = op_desc->GetAttrIfExists>("axis"); + size_t axis_rank = axis.size(); + return axis_rank == 4 && axis[0] == 0 && axis[1] == 2 && + axis[2] == 1 && axis[3] == 3; + }); + + auto* transpose_2 = + pattern->NewNode(transpose_2_repr()) + ->assert_is_op("transpose2") + ->assert_more([](Node* node) { + auto* op_desc = node->Op(); + auto axis = op_desc->GetAttrIfExists>("axis"); + size_t axis_rank = axis.size(); + return axis_rank == 4 && axis[0] == 0 && axis[1] == 2 && + axis[2] == 1 && axis[3] == 3; + }); + auto* transpose_2_out = pattern->NewNode(transpose_2_out_repr()) + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("matmul_v2", "Y"); + auto* transpose_3 = + pattern->NewNode(transpose_3_repr()) + ->assert_is_op("transpose2") + ->assert_more([](Node* node) { + auto* op_desc = node->Op(); + auto axis = op_desc->GetAttrIfExists>("axis"); + size_t axis_rank = axis.size(); + return axis_rank == 4 && axis[0] == 0 && axis[1] == 2 && + axis[2] == 1 && axis[3] == 3; + }); + auto* transpose_3_out = pattern->NewNode(transpose_3_out_repr()) + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("matmul_v2", "Y"); + PDNode* transpose_1_out = nullptr; + PDNode* scale = nullptr; + PDNode* scale_out = nullptr; + if (with_q_scale_) { + transpose_1_out = pattern->NewNode(transpose_1_out_repr()) + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("scale", "X"); + scale = pattern->NewNode(scale_repr())->assert_is_op("scale"); + scale_out = pattern->NewNode(scale_out_repr()) + 
->assert_is_op_output("scale", "Out") + ->assert_is_op_input("matmul_v2", "X"); + } else { + transpose_1_out = pattern->NewNode(transpose_1_out_repr()) + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("matmul_v2", "X"); + } + auto* qk_matmul = + pattern->NewNode(qk_matmul_repr())->assert_is_op("matmul_v2"); + auto* qk_matmul_out = pattern->NewNode(qk_matmul_out_repr()) + ->assert_is_op_output("matmul_v2", "Out") + ->assert_is_op_input("elementwise_add", "X"); + auto* qk_add = + pattern->NewNode(qk_add_repr())->assert_is_op("elementwise_add"); + auto* qk_add_out = pattern->NewNode(qk_add_out_repr()) + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("softmax", "X"); + auto* qk_softmax = + pattern->NewNode(qk_softmax_repr())->assert_is_op("softmax"); + auto* qk_softmax_out = pattern->NewNode(qk_softmax_out_repr()) + ->assert_is_op_output("softmax", "Out") + ->assert_is_op_input("matmul_v2", "X"); + auto* qkv_matmul = + pattern->NewNode(qkv_matmul_repr())->assert_is_op("matmul_v2"); + auto* qkv_matmul_out = pattern->NewNode(qkv_matmul_out_repr()) + ->assert_is_op_output("matmul_v2", "Out") + ->assert_is_op_input("transpose2", "X"); + auto* transpose_4 = + pattern->NewNode(transpose_4_repr())->assert_is_op("transpose2"); + auto* transpose_4_out = pattern->NewNode(transpose_4_out_repr()) + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("reshape2", "X"); + auto* reshape_4 = + pattern->NewNode(reshape_4_repr())->assert_is_op("reshape2"); + auto* output = pattern->NewNode(output_repr()) + ->AsOutput() + ->assert_is_op_output("reshape2", "Out"); + + // link nodes + q_mul->LinksFrom({input_q, q_mul_w}).LinksTo({q_mul_out}); + q_add->LinksFrom({q_mul_out, q_add_bias}).LinksTo({q_add_out}); + reshape_1->LinksFrom({q_add_out}).LinksTo({reshape_1_out}); + transpose_1->LinksFrom({reshape_1_out}).LinksTo({transpose_1_out}); + k_mul->LinksFrom({input_kv, k_mul_w}).LinksTo({k_mul_out}); + k_add->LinksFrom({k_mul_out, k_add_bias}).LinksTo({k_add_out}); + reshape_2->LinksFrom({k_add_out}).LinksTo({reshape_2_out}); + transpose_2->LinksFrom({reshape_2_out}).LinksTo({transpose_2_out}); + if (with_q_scale_) { + scale->LinksFrom({transpose_1_out}).LinksTo({scale_out}); + qk_matmul->LinksFrom({scale_out, transpose_2_out}).LinksTo({qk_matmul_out}); + } else { + qk_matmul->LinksFrom({transpose_1_out, transpose_2_out}) + .LinksTo({qk_matmul_out}); + } + qk_add->LinksFrom({qk_matmul_out, mask}).LinksTo({qk_add_out}); + qk_softmax->LinksFrom({qk_add_out}).LinksTo({qk_softmax_out}); + v_mul->LinksFrom({input_kv, v_mul_w}).LinksTo({v_mul_out}); + v_add->LinksFrom({v_mul_out, v_add_bias}).LinksTo({v_add_out}); + reshape_3->LinksFrom({v_add_out}).LinksTo({reshape_3_out}); + transpose_3->LinksFrom({reshape_3_out}).LinksTo({transpose_3_out}); + qkv_matmul->LinksFrom({qk_softmax_out, transpose_3_out}) + .LinksTo({qkv_matmul_out}); + transpose_4->LinksFrom({qkv_matmul_out}).LinksTo({transpose_4_out}); + reshape_4->LinksFrom({transpose_4_out}).LinksTo({output}); +} + +} // namespace patterns + +void CrossAttentionXPUFusePass::PrepareQKVWeight(Graph* graph, + Scope* scope, + BlockDesc* block, + Node* w, + Node** real_w, + Node** w_max) const { + phi::DenseTensor w_tensor; + phi::DenseTensor w_int16_tensor; + phi::DenseTensor w_max_tensor; + + Assign(scope->Var(w->Name())->Get(), &w_tensor); + CastToFp32(&w_tensor, &w_int16_tensor); + ConvertWithQuant( + &w_int16_tensor, &w_max_tensor, nullptr, false); + + size_t real_w_hash = HashTensor(w_int16_tensor); + size_t 
w_max_hash = HashTensor(w_max_tensor); + std::string real_w_name = std::to_string(real_w_hash); + std::string w_max_name = std::to_string(w_max_hash); + + *real_w = FindNodeWithName(graph, real_w_name); + + if (*real_w == nullptr) { + // Create real_w node + // Update real_w var_desc in block + VarDesc real_w_desc(real_w_name); + real_w_desc.SetPersistable(true); + real_w_desc.SetShape(common::vectorize(w_int16_tensor.dims())); + real_w_desc.SetDataType( + framework::TransToProtoVarType(w_int16_tensor.dtype())); + *real_w = graph->CreateVarNode(&real_w_desc); + auto* block_real_w_desc = block->Var(real_w_name); + block_real_w_desc->SetPersistable(real_w_desc.Persistable()); + block_real_w_desc->SetShape(real_w_desc.GetShape()); + block_real_w_desc->SetDataType(real_w_desc.GetDataType()); + // Create w_max node + // Update w_max var_desc in block + VarDesc w_max_desc(w_max_name); + w_max_desc.SetPersistable(true); + w_max_desc.SetShape(common::vectorize(w_max_tensor.dims())); + w_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); + *w_max = graph->CreateVarNode(&w_max_desc); + auto* block_w_max_desc = block->Var(w_max_name); + block_w_max_desc->SetPersistable(w_max_desc.Persistable()); + block_w_max_desc->SetShape(w_max_desc.GetShape()); + block_w_max_desc->SetDataType(w_max_desc.GetDataType()); + + // Find real_w/w_max variable in scope + auto* w_var = scope->FindVar(real_w_name); + if (w_var == nullptr) { + // Create qkv_w_intx/qkv_w_max variable/tensor + Assign(w_int16_tensor, + scope->Var(real_w_name)->GetMutable()); + Assign(w_max_tensor, + scope->Var(w_max_name)->GetMutable()); + } else { + // Share the same variable + PADDLE_ENFORCE_NOT_NULL( + scope->FindVar(w_max_name), + platform::errors::Fatal( + "w_max(%s) variable should not be nullptr if real_w(%s) " + "variable is exist.", + w_max_name, + real_w_name)); + } + } else { + *w_max = FindNodeWithName(graph, w_max_name); + PADDLE_ENFORCE_NOT_NULL( + *w_max, + platform::errors::Fatal( + "w_max(%s) variable should not be nullptr if real_w(%s) " + "variable is exist.", + w_max_name, + real_w_name)); + } +} + +void CrossAttentionXPUFusePass::PrepareQKVBias(Graph* graph, + Scope* scope, + BlockDesc* block, + Node* q_bias, + Node* k_bias, + Node* v_bias, + Node** real_q_bias, + Node** real_k_bias, + Node** real_v_bias) const { + phi::DenseTensor* q_bias_tensor; + phi::DenseTensor* k_bias_tensor; + phi::DenseTensor* v_bias_tensor; + phi::DenseTensor q_bias_fp32_tensor; + phi::DenseTensor k_bias_fp32_tensor; + phi::DenseTensor v_bias_fp32_tensor; + q_bias_tensor = scope->Var(q_bias->Name())->GetMutable(); + k_bias_tensor = scope->Var(k_bias->Name())->GetMutable(); + v_bias_tensor = scope->Var(v_bias->Name())->GetMutable(); + CastToFp32(q_bias_tensor, &q_bias_fp32_tensor); + CastToFp32(k_bias_tensor, &k_bias_fp32_tensor); + CastToFp32(v_bias_tensor, &v_bias_fp32_tensor); + + size_t q_bias_hash = HashTensor(q_bias_fp32_tensor); + std::string q_bias_name = std::to_string(q_bias_hash); + *real_q_bias = FindNodeWithName(graph, q_bias_name); + + size_t k_bias_hash = HashTensor(k_bias_fp32_tensor); + std::string k_bias_name = std::to_string(k_bias_hash); + *real_k_bias = FindNodeWithName(graph, k_bias_name); + + size_t v_bias_hash = HashTensor(v_bias_fp32_tensor); + std::string v_bias_name = std::to_string(v_bias_hash); + *real_v_bias = FindNodeWithName(graph, v_bias_name); + if (*real_q_bias == nullptr) { + // Create q_bias node + // Update q_bias var_desc in block + VarDesc q_bias_desc(q_bias_name); + 
q_bias_desc.SetPersistable(true); + q_bias_desc.SetShape(common::vectorize(q_bias_fp32_tensor.dims())); + q_bias_desc.SetDataType( + framework::TransToProtoVarType(q_bias_fp32_tensor.dtype())); + *real_q_bias = graph->CreateVarNode(&q_bias_desc); + auto* block_q_bias_desc = block->Var(q_bias_name); + block_q_bias_desc->SetPersistable(q_bias_desc.Persistable()); + block_q_bias_desc->SetShape(q_bias_desc.GetShape()); + block_q_bias_desc->SetDataType(q_bias_desc.GetDataType()); + Assign(q_bias_fp32_tensor, + scope->Var(q_bias_name)->GetMutable()); + } + if (*real_k_bias == nullptr) { + // Create k_bias node + // Update k_bias var_desc in block + VarDesc k_bias_desc(k_bias_name); + k_bias_desc.SetPersistable(true); + k_bias_desc.SetShape(common::vectorize(k_bias_fp32_tensor.dims())); + k_bias_desc.SetDataType( + framework::TransToProtoVarType(k_bias_fp32_tensor.dtype())); + *real_k_bias = graph->CreateVarNode(&k_bias_desc); + auto* block_k_bias_desc = block->Var(k_bias_name); + block_k_bias_desc->SetPersistable(k_bias_desc.Persistable()); + block_k_bias_desc->SetShape(k_bias_desc.GetShape()); + block_k_bias_desc->SetDataType(k_bias_desc.GetDataType()); + Assign(k_bias_fp32_tensor, + scope->Var(k_bias_name)->GetMutable()); + } + if (*real_v_bias == nullptr) { + // Create v_bias node + // Update v_bias var_desc in block + VarDesc v_bias_desc(v_bias_name); + v_bias_desc.SetPersistable(true); + v_bias_desc.SetShape(common::vectorize(v_bias_fp32_tensor.dims())); + v_bias_desc.SetDataType( + framework::TransToProtoVarType(v_bias_fp32_tensor.dtype())); + *real_v_bias = graph->CreateVarNode(&v_bias_desc); + auto* block_v_bias_desc = block->Var(v_bias_name); + block_v_bias_desc->SetPersistable(v_bias_desc.Persistable()); + block_v_bias_desc->SetShape(v_bias_desc.GetShape()); + block_v_bias_desc->SetDataType(v_bias_desc.GetDataType()); + Assign(v_bias_fp32_tensor, + scope->Var(v_bias_name)->GetMutable()); + } +} + +void CrossAttentionXPUFusePass::ApplyCrossAttentionXPUFuse( + ir::Graph* graph, bool with_q_scale) const { + GraphPatternDetector gpd; + patterns::CrossAttentionFusePattern pattern( + gpd.mutable_pattern(), name_scope_, with_q_scale); + int found_subgraph_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(4) << "handle CrossAttentionXPUFusePass"; + + // declare operator node's name + GET_IR_NODE(q_mul); + GET_IR_NODE(k_mul); + GET_IR_NODE(v_mul); + GET_IR_NODE(q_add); + GET_IR_NODE(k_add); + GET_IR_NODE(v_add); + GET_IR_NODE(reshape_1); + GET_IR_NODE(reshape_2); + GET_IR_NODE(reshape_3); + GET_IR_NODE(transpose_1); + GET_IR_NODE(transpose_2); + GET_IR_NODE(transpose_3); + GET_IR_NODE(scale); + GET_IR_NODE(qk_matmul); + GET_IR_NODE(qk_add); + GET_IR_NODE(qk_softmax); + GET_IR_NODE(qkv_matmul); + GET_IR_NODE(transpose_4); + GET_IR_NODE(reshape_4); + + // declare variable node's name + GET_IR_NODE(input_q); + GET_IR_NODE(input_kv); + GET_IR_NODE(mask); + GET_IR_NODE(q_mul_w); + GET_IR_NODE(k_mul_w); + GET_IR_NODE(v_mul_w); + GET_IR_NODE(q_mul_out); + GET_IR_NODE(k_mul_out); + GET_IR_NODE(v_mul_out); + GET_IR_NODE(q_add_bias); + GET_IR_NODE(k_add_bias); + GET_IR_NODE(v_add_bias); + GET_IR_NODE(q_add_out); + GET_IR_NODE(k_add_out); + GET_IR_NODE(v_add_out); + GET_IR_NODE(reshape_1_out); + GET_IR_NODE(reshape_2_out); + GET_IR_NODE(reshape_3_out); + GET_IR_NODE(transpose_1_out); + GET_IR_NODE(transpose_2_out); + GET_IR_NODE(transpose_3_out); + GET_IR_NODE(scale_out); + GET_IR_NODE(qk_matmul_out); + GET_IR_NODE(qk_add_out); + 
GET_IR_NODE(qk_softmax_out); + GET_IR_NODE(qkv_matmul_out); + GET_IR_NODE(transpose_4_out); + GET_IR_NODE(output); + + // generate fuse op + auto* scope = param_scope(); + auto* block = q_mul->Op()->Block(); + framework::OpDesc fused_op_desc(block); + fused_op_desc.SetType("cross_attention_xpu"); + + Node* real_q_w = nullptr; + Node* q_w_max = nullptr; + Node* real_k_w = nullptr; + Node* k_w_max = nullptr; + Node* real_v_w = nullptr; + Node* v_w_max = nullptr; + PrepareQKVWeight(graph, scope, block, q_mul_w, &real_q_w, &q_w_max); + PrepareQKVWeight(graph, scope, block, k_mul_w, &real_k_w, &k_w_max); + PrepareQKVWeight(graph, scope, block, v_mul_w, &real_v_w, &v_w_max); + + std::vector fc_weight_nodes = {real_q_w, real_k_w, real_v_w}; + std::vector fc_weight_names; + for (auto* node : fc_weight_nodes) { + if (node) { + fc_weight_names.push_back(node->Name()); + } + } + std::vector fc_weight_max_nodes = {q_w_max, k_w_max, v_w_max}; + std::vector fc_weight_max_names; + for (auto* node : fc_weight_max_nodes) { + if (node) { + fc_weight_max_names.push_back(node->Name()); + } + } + + Node* q_add_bias_fp32 = nullptr; + Node* k_add_bias_fp32 = nullptr; + Node* v_add_bias_fp32 = nullptr; + PrepareQKVBias(graph, + scope, + block, + q_add_bias, + k_add_bias, + v_add_bias, + &q_add_bias_fp32, + &k_add_bias_fp32, + &v_add_bias_fp32); + std::vector fc_bias_nodes = { + q_add_bias_fp32, k_add_bias_fp32, v_add_bias_fp32}; + std::vector fc_bias_names; + for (auto* node : fc_bias_nodes) { + if (node) { + fc_bias_names.push_back(node->Name()); + } + } + + // set input of fuse_op + fused_op_desc.SetInput("input_q", {input_q->Name()}); + fused_op_desc.SetInput("input_kv", {input_kv->Name()}); + fused_op_desc.SetInput("fc_weight", fc_weight_names); + fused_op_desc.SetInput("fc_weight_max", fc_weight_max_names); + fused_op_desc.SetInput("fc_bias", fc_bias_names); + fused_op_desc.SetInput("mask", {mask->Name()}); + + // set attributes of fuse_op + if (with_q_scale) { + float scale_val = PADDLE_GET_CONST(float, scale->Op()->GetAttr("scale")); + fused_op_desc.SetAttr("alpha", scale_val); + VLOG(4) << "while with_q_scale, scale_val = " << scale_val; + } else { + // in xdnn, 0.0f is default value of NewBaseAttnParam.alpha + fused_op_desc.SetAttr("alpha", 0.0f); + } + fused_op_desc.SetAttr( + "head_num", static_cast(transpose_1_out->Var()->GetShape()[1])); + fused_op_desc.SetAttr( + "head_dim", static_cast(transpose_1_out->Var()->GetShape()[3])); + // TODO(tianrui): support more out_dtype + fused_op_desc.SetAttr("out_dtype", input_q->Var()->GetDataType()); + + // set output of fuse_op + VarDesc fused_op_out_max_desc("qkv_max"); + Node* fused_op_out_max = graph->CreateVarNode(&fused_op_out_max_desc); + fused_op_desc.SetOutput("qkv_max", {"qkv_max"}); + fused_op_desc.SetOutput("qkv", {output->Name()}); + + auto* fused_op = graph->CreateOpNode(&fused_op_desc); + + // link input of fuse_op + IR_NODE_LINK_TO(input_q, fused_op); + IR_NODE_LINK_TO(input_kv, fused_op); + for (auto* node : fc_weight_nodes) { + if (node) { + IR_NODE_LINK_TO(node, fused_op); + } + } + for (auto* node : fc_weight_max_nodes) { + if (node) { + IR_NODE_LINK_TO(node, fused_op); + } + } + for (auto* node : fc_bias_nodes) { + if (node) { + IR_NODE_LINK_TO(node, fused_op); + } + } + // link output of fuse_op + IR_NODE_LINK_TO(fused_op, output); + IR_NODE_LINK_TO(fused_op, fused_op_out_max); + + // delete useless node + std::unordered_set del_node_set; + del_node_set.insert(q_mul); + del_node_set.insert(q_mul_out); + del_node_set.insert(k_mul); + 
del_node_set.insert(k_mul_out);
+    del_node_set.insert(v_mul);
+    del_node_set.insert(v_mul_out);
+    del_node_set.insert(q_add);
+    del_node_set.insert(q_add_out);
+    del_node_set.insert(k_add);
+    del_node_set.insert(k_add_out);
+    del_node_set.insert(v_add);
+    del_node_set.insert(v_add_out);
+    del_node_set.insert(reshape_1);
+    del_node_set.insert(reshape_1_out);
+    del_node_set.insert(reshape_2);
+    del_node_set.insert(reshape_2_out);
+    del_node_set.insert(reshape_3);
+    del_node_set.insert(reshape_3_out);
+    del_node_set.insert(transpose_1);
+    del_node_set.insert(transpose_1_out);
+    del_node_set.insert(transpose_2);
+    del_node_set.insert(transpose_2_out);
+    del_node_set.insert(transpose_3);
+    del_node_set.insert(transpose_3_out);
+    del_node_set.insert(qk_matmul);
+    del_node_set.insert(qk_matmul_out);
+    del_node_set.insert(qk_add);
+    del_node_set.insert(qk_add_out);
+    del_node_set.insert(qk_softmax);
+    del_node_set.insert(qk_softmax_out);
+    del_node_set.insert(qkv_matmul);
+    del_node_set.insert(qkv_matmul_out);
+    del_node_set.insert(transpose_4);
+    del_node_set.insert(transpose_4_out);
+    del_node_set.insert(reshape_4);
+    if (with_q_scale) {
+      del_node_set.insert(scale);
+      del_node_set.insert(scale_out);
+    }
+    GraphSafeRemoveNodes(graph, del_node_set);
+
+    found_subgraph_count++;
+  };
+
+  gpd(graph, handler);
+  AddStatis(found_subgraph_count);
+}
+
+void CrossAttentionXPUFusePass::ApplyImpl(ir::Graph* graph) const {
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::PreconditionNotMet("graph should not be null."));
+  Init(name_scope_, graph);
+
+  for (auto with_q_scale : {true, false}) {
+    ApplyCrossAttentionXPUFuse(graph, with_q_scale);
+  }
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(cross_attention_xpu_fuse_pass,
+              paddle::framework::ir::CrossAttentionXPUFusePass);
+
+REGISTER_PASS_CAPABILITY(cross_attention_xpu_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination().EQ(
+            "cross_attention_xpu", 0));
diff --git a/paddle/fluid/framework/ir/xpu/cross_attention_xpu_fuse_pass.h b/paddle/fluid/framework/ir/xpu/cross_attention_xpu_fuse_pass.h
new file mode 100644
index 0000000000000..9a04275294ea8
--- /dev/null
+++ b/paddle/fluid/framework/ir/xpu/cross_attention_xpu_fuse_pass.h
@@ -0,0 +1,126 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace phi {
+class DenseTensor;
+}  // namespace phi
+
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+/*
+This pass is used to fuse the cross attention subgraph into one op in
+decoder models.
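+
+In formula form (a summary of the pattern below):
+  qkv = softmax(alpha * (q Wq) (kv Wk)^T + mask) (kv Wv)
+where q/k/v are first reshaped to [batch, head_num, seq, head_dim]; alpha is
+the factor of the optional scale op (the attr falls back to 0.0f, xdnn's
+NewBaseAttnParam.alpha default, when scale is absent), and the result is
+reshaped back to [batch, seq_q, head_num * head_dim].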
+ +Origin subgraph: + + mask input_q input_kv + | | | | + | | |-----------| + | matmul matmul matmul + | |q |k |v + | | | | + | | | | + | add add add + | | | | + | | | | + | reshape reshape reshape + | | | | + | | | | + | transpose transpose transpose + | | | | + | | | | + | (scale) | | + | | | | + \ |(x) |(y) | + \ \ / | + \ qk_matmul | + \ | | + \ | | + add / + | / + | / + softmax / + \ / + \ / + qkv_matmul + | + | + transpose + | + | + reshape + | + | + output + +------------------------------------------------------- +Fused subgraph: + input_q input_kv + | | + | | + | | + cross_attention_xpu + | + | + | + output + +*/ + +class CrossAttentionXPUFusePass : public FusePassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; + + private: + void ApplyCrossAttentionXPUFuse(ir::Graph* graph, bool with_q_scale) const; + + // 1. Generate q/k/v_w_max tensor + // 2. Quant q/k/v_w to int16 + void PrepareQKVWeight(Graph* graph, + Scope* scope, + BlockDesc* block, + Node* w, + Node** real_w, + Node** w_max) const; + + // Cast fc_bias to fp32 + void PrepareQKVBias(Graph* graph, + Scope* scope, + BlockDesc* block, + Node* q_bias, + Node* k_bias, + Node* v_bias, + Node** real_q_bias, + Node** real_k_bias, + Node** real_v_bias) const; + + const std::string name_scope_{"cross_attention_xpu_fuse_pass"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/xpu/decoder_attention_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/decoder_attention_xpu_fuse_pass.cc index ad8dd1a55a868..c86180e24088a 100644 --- a/paddle/fluid/framework/ir/xpu/decoder_attention_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/decoder_attention_xpu_fuse_pass.cc @@ -17,6 +17,7 @@ #include "glog/logging.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/quantize_helper.h" #include "paddle/fluid/framework/ir/xpu/pass_utils.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" @@ -162,15 +163,15 @@ DecoderAttentionFusePattern::DecoderAttentionFusePattern( // link nodes reshape2_1->LinksFrom({input_q}).LinksTo({reshape2_1_out}); - reshape2_2->LinksFrom({input_k}).LinksTo({reshape2_2_out}); - reshape2_3->LinksFrom({input_v}).LinksTo({reshape2_3_out}); transpose2_1->LinksFrom({reshape2_1_out}).LinksTo({transpose2_1_out}); + reshape2_2->LinksFrom({input_k}).LinksTo({reshape2_2_out}); transpose2_2->LinksFrom({reshape2_2_out}).LinksTo({transpose2_2_out}); - transpose2_3->LinksFrom({reshape2_3_out}).LinksTo({transpose2_3_out}); qk_matmul->LinksFrom({transpose2_1_out, transpose2_2_out}) .LinksTo({qk_matmul_out}); scale->LinksFrom({qk_matmul_out}).LinksTo({scale_out}); qk_softmax->LinksFrom({scale_out}).LinksTo({qk_softmax_out}); + reshape2_3->LinksFrom({input_v}).LinksTo({reshape2_3_out}); + transpose2_3->LinksFrom({reshape2_3_out}).LinksTo({transpose2_3_out}); qkv_matmul->LinksFrom({qk_softmax_out, transpose2_3_out}) .LinksTo({qkv_matmul_out}); transpose2_4->LinksFrom({qkv_matmul_out}).LinksTo({transpose2_4_out}); @@ -222,6 +223,7 @@ void DecoderAttentionXPUFusePass::ApplyDecoderAttentionXPUFuse( GET_IR_NODE(output); // Generate fuse op + auto* scope = param_scope(); auto* block = reshape2_1->Op()->Block(); framework::OpDesc fused_op_desc(block); fused_op_desc.SetType("qkv_attention_xpu"); @@ -230,6 +232,54 @@ void DecoderAttentionXPUFusePass::ApplyDecoderAttentionXPUFuse( fused_op_desc.SetInput("q", {input_q->Name()}); fused_op_desc.SetInput("k", 
{input_k->Name()});
   fused_op_desc.SetInput("v", {input_v->Name()});
+  std::unordered_map<std::string, std::vector<float>> var_quant_scales =
+      GetQuantInfoFromTheGraph(graph, "has_quant_info", "var_quant_scales");
+  // record q/k/v max, qk_max, and qkv_max
+  std::vector<Node*> input_max_nodes;
+  if (var_quant_scales.find(input_q->Name()) != var_quant_scales.end() &&
+      var_quant_scales.find(input_k->Name()) != var_quant_scales.end() &&
+      var_quant_scales.find(input_v->Name()) != var_quant_scales.end() &&
+      var_quant_scales.find(qk_matmul_out->Name()) !=
+          var_quant_scales.end() &&
+      var_quant_scales.find(qkv_matmul_out->Name()) !=
+          var_quant_scales.end()) {
+    std::vector<float> input_max_vec;
+    input_max_vec.push_back(var_quant_scales.at(input_q->Name())[0]);
+    input_max_vec.push_back(var_quant_scales.at(input_k->Name())[0]);
+    input_max_vec.push_back(var_quant_scales.at(input_v->Name())[0]);
+    input_max_vec.push_back(var_quant_scales.at(qk_matmul_out->Name())[0]);
+    input_max_vec.push_back(var_quant_scales.at(qkv_matmul_out->Name())[0]);
+    std::vector<std::string> quant_max_names = {
+        "q_max", "k_max", "v_max", "qk_max", "qkv_max"};
+    for (size_t i = 0; i < input_max_vec.size(); i++) {
+      std::string input_max_name =
+          input_q->Name() + "_" + std::to_string(i) + "_max_in";
+      int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1);
+      VarDesc input_max_desc(input_max_name);
+      input_max_desc.SetPersistable(true);
+      input_max_desc.SetShape({static_cast<int64_t>(max_ptr_size)});
+      input_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32);
+      Node* input_max_in = graph->CreateVarNode(&input_max_desc);
+      auto* block_input_max_in_desc = block->Var(input_max_name);
+      block_input_max_in_desc->SetPersistable(input_max_desc.Persistable());
+      block_input_max_in_desc->SetShape(input_max_desc.GetShape());
+      block_input_max_in_desc->SetDataType(input_max_desc.GetDataType());
+      phi::DenseTensor input_max_in_cpu_tensor;
+      auto* cpu_ctx = static_cast<phi::CPUContext*>(
+          platform::DeviceContextPool::Instance().Get(phi::CPUPlace()));
+      input_max_in_cpu_tensor.set_type(phi::DataType::FLOAT32);
+      input_max_in_cpu_tensor.Resize({max_ptr_size});
+      std::vector<float> input_max(max_ptr_size, input_max_vec[i]);
+      memcpy(cpu_ctx->Alloc<float>(&input_max_in_cpu_tensor),
+             input_max.data(),
+             max_ptr_size * sizeof(float));
+      Assign(input_max_in_cpu_tensor,
+             scope->Var(input_max_name)->GetMutable<phi::DenseTensor>());
+      fused_op_desc.SetInput(quant_max_names[i], {input_max_name});
+
+      input_max_nodes.push_back(input_max_in);
+    }
+  }
 
   // set attributes of fuse_op
   float scale_val = PADDLE_GET_CONST(float, scale->Op()->GetAttr("scale"));
@@ -245,9 +295,6 @@ void DecoderAttentionXPUFusePass::ApplyDecoderAttentionXPUFuse(
   fused_op_desc.SetAttr("out_dtype", input_q->Var()->GetDataType());
 
   // set output of fuse_op
-  VarDesc fused_op_out_max_desc("qkv_max");
-  Node* fused_op_out_max = graph->CreateVarNode(&fused_op_out_max_desc);
-  fused_op_desc.SetOutput("qkv_max", {"qkv_max"});
   fused_op_desc.SetOutput("qkv", {output->Name()});
 
   auto* fused_op = graph->CreateOpNode(&fused_op_desc);
@@ -256,7 +303,9 @@
   IR_NODE_LINK_TO(input_k, fused_op);
   IR_NODE_LINK_TO(input_v, fused_op);
   IR_NODE_LINK_TO(fused_op, output);
-  IR_NODE_LINK_TO(fused_op, fused_op_out_max);
+  for (size_t i = 0; i < input_max_nodes.size(); i++) {
+    IR_NODE_LINK_TO(input_max_nodes[i], fused_op);
+  }
 
   // delete useless node
   std::unordered_set<const Node*> del_node_set;
diff --git a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc
index 04b645a4d33d8..2010d4cb48de0 100644
--- a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc
@@ -841,6 +841,35 @@ int FcXPUFusePass::ApplyImpl(ir::Graph* graph,
   } else if (filter_data_type == phi::DataType::FLOAT16) {
     op_weights_precision = "float16";
   }
+  if (op_weights_precision == "float32" &&
+      AreScalesPresentForNodes(&var_quant_scales, {mul_w})) {
+    // convert weight to int8
+    auto* var = scope->FindVar(mul_w_name);
+    PADDLE_ENFORCE_NOT_NULL(
+        var,
+        platform::errors::NotFound(
+            "The input persistable var [%s] is not found.", mul_w_name));
+    auto* weight_tensor = var->GetMutable<phi::DenseTensor>();
+    float* fp32_weight_data = weight_tensor->data<float>();
+    std::vector<int8_t> weight_data;
+    weight_data.resize(weight_tensor->numel());
+    for (int i = 0; i < weight_tensor->numel(); i++) {
+      weight_data[i] = static_cast<int8_t>(fp32_weight_data[i]);
+    }
+    const auto weight_dims = weight_tensor->dims();
+    weight_tensor->clear();  // discard the fp32 weight buffer
+    weight_tensor->set_type(phi::DataType::INT8);
+    weight_tensor->Resize(common::make_ddim(common::vectorize(weight_dims)));
+    auto* cpu_ctx = static_cast<phi::CPUContext*>(
+        platform::DeviceContextPool::Instance().Get(phi::CPUPlace()));
+    auto* new_weight_data = cpu_ctx->Alloc<int8_t>(weight_tensor);
+    memcpy(new_weight_data,
+           weight_data.data(),
+           weight_tensor->numel() * sizeof(int8_t));
+    op_weights_precision = "int8";
+  }
+
   VLOG(4) << "FC fusion fuse pass is running on " << op_weights_precision
           << " precision!";
   auto* block = mul->Op()->Block();
diff --git a/paddle/fluid/framework/ir/xpu/group_norm_silu_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/group_norm_silu_xpu_fuse_pass.cc
new file mode 100644
index 0000000000000..86fef3fd0c2ae
--- /dev/null
+++ b/paddle/fluid/framework/ir/xpu/group_norm_silu_xpu_fuse_pass.cc
@@ -0,0 +1,208 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
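For reference, the fusion introduced below is mathematically group normalization followed by silu: y = (x - mean_g) / sqrt(var_g + eps) * scale_c + bias_c, then out = y * sigmoid(y). A minimal NCHW sketch of the fused computation follows; it assumes float input with c divisible by groups, and is illustrative only, not the XPU kernel or Paddle API.

#include <cmath>
#include <vector>

static void GroupNormSiluRef(const std::vector<float>& x,      // [n, c, h, w]
                             const std::vector<float>& scale,  // [c]
                             const std::vector<float>& bias,   // [c]
                             int n, int c, int h, int w, int groups, float eps,
                             std::vector<float>* out) {
  const int spatial = h * w;
  const int cpg = c / groups;  // channels per group
  out->resize(x.size());
  for (int ni = 0; ni < n; ++ni) {
    for (int g = 0; g < groups; ++g) {
      // mean/variance over one group's channels and spatial positions
      double sum = 0.0, sq_sum = 0.0;
      const int base = (ni * c + g * cpg) * spatial;
      const int count = cpg * spatial;
      for (int i = 0; i < count; ++i) {
        const double val = x[base + i];
        sum += val;
        sq_sum += val * val;
      }
      const double mean = sum / count;
      const double var = sq_sum / count - mean * mean;
      const double inv_std = 1.0 / std::sqrt(var + eps);
      for (int ci = 0; ci < cpg; ++ci) {
        const int ch = g * cpg + ci;
        for (int s = 0; s < spatial; ++s) {
          const int idx = (ni * c + ch) * spatial + s;
          const float y =
              static_cast<float>((x[idx] - mean) * inv_std) * scale[ch] +
              bias[ch];
          (*out)[idx] = y / (1.0f + std::exp(-y));  // silu(y) = y * sigmoid(y)
        }
      }
    }
  }
}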
+
+#include
+
+#include "glog/logging.h"
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/ir/xpu/pass_utils.h"
+#include "paddle/fluid/framework/ir/xpu/quant_utils.h"
+#include "paddle/fluid/framework/op_version_registry.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace phi {
+class DenseTensor;
+}  // namespace phi
+
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace framework {
+namespace ir {
+namespace patterns {
+
+/*
+Fuse a group_norm + silu activation block into one group_norm_silu_xpu op.
+For example:
+graph:
+                      X
+            Scale     |     Bias
+                 \    |    /
+                 group norm
+                /     |     \
+               /      |      \
+        variance      |      mean
+                      |
+                     silu
+                      |
+                    output
+------------------------------------------------------
+After the pass is applied:
+                      X
+            Scale     |     Bias
+                 \    |    /
+              gn_silu_fusion
+                      |
+                     Out
+*/
+struct GroupNormalizeSiluXPUPattern : public PatternBase {
+  GroupNormalizeSiluXPUPattern(PDPattern* pattern,
+                               const std::string& name_scope);
+  // declare operator node's name
+  PATTERN_DECL_NODE(gn);
+  PATTERN_DECL_NODE(silu);
+  // declare variable node's name
+  PATTERN_DECL_NODE(gn_x);
+  PATTERN_DECL_NODE(gn_bias);
+  PATTERN_DECL_NODE(gn_scale);
+  PATTERN_DECL_NODE(gn_y);
+  PATTERN_DECL_NODE(gn_mean);
+  PATTERN_DECL_NODE(gn_variance);
+  PATTERN_DECL_NODE(silu_out);
+};
+
+GroupNormalizeSiluXPUPattern::GroupNormalizeSiluXPUPattern(
+    PDPattern* pattern, const std::string& name_scope)
+    : PatternBase(pattern, name_scope, name_scope) {
+  auto gn = pattern->NewNode(gn_repr())->assert_is_op("group_norm");
+  auto gn_x = pattern->NewNode(gn_x_repr())
+                  ->assert_is_op_input("group_norm", "X")
+                  ->AsInput();
+  auto gn_bias = pattern->NewNode(gn_bias_repr())
+                     ->assert_is_op_input("group_norm", "Bias")
+                     ->assert_is_persistable_var()
+                     ->AsInput();
+  auto gn_scale = pattern->NewNode(gn_scale_repr())
+                      ->assert_is_op_input("group_norm", "Scale")
+                      ->assert_is_persistable_var()
+                      ->AsInput();
+  auto gn_y = pattern->NewNode(gn_y_repr())
+                  ->assert_is_op_output("group_norm", "Y")
+                  ->assert_is_op_input("silu", "X")
+                  ->assert_has_n_outputs(1);
+  auto gn_mean = pattern->NewNode(gn_mean_repr())
+                     ->assert_is_op_output("group_norm", "Mean")
+                     ->assert_has_n_outputs(0);
+  auto gn_variance = pattern->NewNode(gn_variance_repr())
+                         ->assert_is_op_output("group_norm", "Variance")
+                         ->assert_has_n_outputs(0);
+  gn->LinksFrom({gn_x, gn_bias, gn_scale})
+      .LinksTo({gn_y, gn_mean, gn_variance});
+
+  auto silu = pattern->NewNode(silu_repr())->assert_is_op("silu");
+  auto silu_out = pattern->NewNode(silu_out_repr())
+                      ->AsOutput()
+                      ->assert_is_op_output("silu", "Out");
+  silu->LinksFrom({gn_y}).LinksTo({silu_out});
+}
+
+}  // namespace patterns
+
+class GroupNormalizeSiluXPUFusePass : public FusePassBase {
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override;
+
+ private:
+  void FuseGroupNormalizeSilu(ir::Graph* graph) const;
+
+  const std::string name_scope_{"group_norm_silu_xpu_fuse_pass"};
+};
+
+void GroupNormalizeSiluXPUFusePass::ApplyImpl(ir::Graph* graph) const {
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::PreconditionNotMet("graph should not be null."));
+  Init(name_scope_, graph);
+
+  FuseGroupNormalizeSilu(graph);
+}
+
+void GroupNormalizeSiluXPUFusePass::FuseGroupNormalizeSilu(
+    ir::Graph* graph) const {
+  GraphPatternDetector gpd;
+  patterns::GroupNormalizeSiluXPUPattern pattern(gpd.mutable_pattern(),
+                                                 name_scope_);
+
+  int found_subgraph_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* graph) {
+    VLOG(4) << "handle GroupNormalizeSiluXPUFusePass fuse";
+    // declare operator node's name
+    GET_IR_NODE(gn);
+    GET_IR_NODE(silu);
+    // declare variable node's name
+    GET_IR_NODE(gn_x);
+    GET_IR_NODE(gn_bias);
+    GET_IR_NODE(gn_scale);
+    GET_IR_NODE(gn_y);
+    GET_IR_NODE(gn_mean);
+    GET_IR_NODE(gn_variance);
+    GET_IR_NODE(silu_out);
+
+    auto* block = gn->Op()->Block();
+    auto* scope = param_scope();
+    PADDLE_ENFORCE_NOT_NULL(
+        scope, platform::errors::InvalidArgument("Scope cannot be nullptr."));
+    // delete useless node
+    std::unordered_set<const Node*> delete_nodes;
+
+    float eps = PADDLE_GET_CONST(float, gn->Op()->GetAttr("epsilon"));
+    int groups = PADDLE_GET_CONST(int, gn->Op()->GetAttr("groups"));
+
+    std::string fused_op_out_name = silu_out->Name();
+    // Generate the group_norm_silu_xpu fused op
+    framework::OpDesc fused_op_desc(block);
+
+    fused_op_desc.SetType("group_norm_silu_xpu");
+    // set inputs and attrs for the fused op
+    fused_op_desc.SetInput("x", {gn_x->Name()});
+    fused_op_desc.SetInput("bias", {gn_bias->Name()});
+    fused_op_desc.SetInput("scale", {gn_scale->Name()});
+    fused_op_desc.SetAttr("epsilon", eps);
+    fused_op_desc.SetAttr("groups", groups);
+    fused_op_desc.SetOutput("out", {fused_op_out_name});
+    // relink fused op
+    auto* fused_op = graph->CreateOpNode(&fused_op_desc);
+    IR_NODE_LINK_TO(gn_x, fused_op);
+    IR_NODE_LINK_TO(gn_bias, fused_op);
+    IR_NODE_LINK_TO(gn_scale, fused_op);
+    IR_NODE_LINK_TO(fused_op, silu_out);
+
+    delete_nodes.insert({gn, silu, gn_y, gn_mean, gn_variance});
+    GraphSafeRemoveNodes(graph, delete_nodes);
+    found_subgraph_count++;
+  };
+
+  gpd(graph, handler);
+  AddStatis(found_subgraph_count);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(group_norm_silu_xpu_fuse_pass,
+              paddle::framework::ir::GroupNormalizeSiluXPUFusePass);
+
+REGISTER_PASS_CAPABILITY(group_norm_silu_xpu_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination().EQ(
+            "group_norm_silu_xpu", 0));
diff --git a/paddle/fluid/framework/ir/xpu/qk_qkv_attention_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/qk_qkv_attention_xpu_fuse_pass.cc
index 2ca1d081aab89..2d56306e97faa 100644
--- a/paddle/fluid/framework/ir/xpu/qk_qkv_attention_xpu_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/xpu/qk_qkv_attention_xpu_fuse_pass.cc
@@ -17,6 +17,7 @@
 #include "glog/logging.h"
 
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/quantize_helper.h"
 #include "paddle/fluid/framework/ir/xpu/pass_utils.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -212,6 +213,7 @@ void QkQkvAttentionXPUFusePass::ApplyQkQkvAttentionXPUFuse(
   GET_IR_NODE(output);
 
   // Generate fuse op
+  auto* scope = param_scope();
   auto* block = reshape_1->Op()->Block();
   framework::OpDesc fused_op_desc(block);
   fused_op_desc.SetType("qkv_attention_xpu");
@@ -219,6 +221,57 @@
   fused_op_desc.SetInput("q", {input->Name()});
   fused_op_desc.SetInput("k", {input->Name()});
   fused_op_desc.SetInput("v", {input->Name()});
+  std::unordered_map<std::string, std::vector<float>> var_quant_scales =
+      GetQuantInfoFromTheGraph(graph, "has_quant_info", "var_quant_scales");
+  // record q/k/v max, qk_max, and qkv_max
+  std::vector<Node*> input_max_nodes;
+  if (var_quant_scales.find(input->Name()) != var_quant_scales.end() &&
+
var_quant_scales.find(qk_matmul_out->Name()) != + var_quant_scales.end() && + var_quant_scales.find(qkv_matmul_out->Name()) != + var_quant_scales.end()) { + std::vector input_max_vec; + input_max_vec.push_back(var_quant_scales.at(input->Name())[0]); + input_max_vec.push_back(var_quant_scales.at(qk_matmul_out->Name())[0]); + input_max_vec.push_back(var_quant_scales.at(qkv_matmul_out->Name())[0]); + std::vector quant_max_names = { + "input_max", "qk_max", "qkv_max"}; + for (size_t i = 0; i < input_max_vec.size(); i++) { + std::string input_max_name = + input->Name() + "_" + std::to_string(i) + "_max_in"; + int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); + VarDesc input_max_desc(input_max_name); + input_max_desc.SetPersistable(true); + input_max_desc.SetShape({static_cast(max_ptr_size)}); + input_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); + Node* input_max_in = graph->CreateVarNode(&input_max_desc); + auto* block_input_max_in_desc = block->Var(input_max_name); + block_input_max_in_desc->SetPersistable(input_max_desc.Persistable()); + block_input_max_in_desc->SetShape(input_max_desc.GetShape()); + block_input_max_in_desc->SetDataType(input_max_desc.GetDataType()); + + phi::DenseTensor input_max_in_cpu_tensor; + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + input_max_in_cpu_tensor.set_type(phi::DataType::FLOAT32); + input_max_in_cpu_tensor.Resize({max_ptr_size}); + std::vector input_max(max_ptr_size, input_max_vec[i]); + memcpy(cpu_ctx->Alloc(&input_max_in_cpu_tensor), + input_max.data(), + max_ptr_size * sizeof(float)); + Assign(input_max_in_cpu_tensor, + scope->Var(input_max_name)->GetMutable()); + if (i == 0) { + fused_op_desc.SetInput("q_max", {input_max_name}); + fused_op_desc.SetInput("k_max", {input_max_name}); + fused_op_desc.SetInput("v_max", {input_max_name}); + } else { + fused_op_desc.SetInput(quant_max_names[i], {input_max_name}); + } + input_max_nodes.push_back(input_max_in); + } + } + // set attributes of fuse_op if (with_q_scale) { float scale_val = PADDLE_GET_CONST(float, scale->Op()->GetAttr("scale")); @@ -239,16 +292,15 @@ void QkQkvAttentionXPUFusePass::ApplyQkQkvAttentionXPUFuse( fused_op_desc.SetAttr("out_dtype", input->Var()->GetDataType()); // set output of fuse_op - VarDesc fused_op_out_max_desc("qkv_max"); - Node* fused_op_out_max = graph->CreateVarNode(&fused_op_out_max_desc); - fused_op_desc.SetOutput("qkv_max", {"qkv_max"}); fused_op_desc.SetOutput("qkv", {output->Name()}); auto* fused_op = graph->CreateOpNode(&fused_op_desc); IR_NODE_LINK_TO(input, fused_op); IR_NODE_LINK_TO(fused_op, output); - IR_NODE_LINK_TO(fused_op, fused_op_out_max); + for (size_t i = 0; i < input_max_nodes.size(); i++) { + IR_NODE_LINK_TO(input_max_nodes[i], fused_op); + } // delete useless node std::unordered_set del_node_set; diff --git a/paddle/fluid/framework/ir/xpu/quant_dequant_xpu_pass.cc b/paddle/fluid/framework/ir/xpu/quant_dequant_xpu_pass.cc index 29a222281b217..eecefa6330d69 100644 --- a/paddle/fluid/framework/ir/xpu/quant_dequant_xpu_pass.cc +++ b/paddle/fluid/framework/ir/xpu/quant_dequant_xpu_pass.cc @@ -17,7 +17,7 @@ #include #include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" +#include "paddle/fluid/framework/ir/onednn/onednn_pass_util.h" #include "paddle/fluid/framework/ir/quantize_helper.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git 
a/paddle/fluid/framework/ir/xpu/spatial_transformer_resblock_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/spatial_transformer_resblock_xpu_fuse_pass.cc
new file mode 100644
index 0000000000000..a80d3763c366d
--- /dev/null
+++ b/paddle/fluid/framework/ir/xpu/spatial_transformer_resblock_xpu_fuse_pass.cc
@@ -0,0 +1,594 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+
+#include "glog/logging.h"
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/ir/xpu/pass_utils.h"
+#include "paddle/fluid/framework/ir/xpu/quant_utils.h"
+#include "paddle/fluid/framework/op_version_registry.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace phi {
+class DenseTensor;
+}  // namespace phi
+
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace framework {
+namespace ir {
+namespace patterns {
+
+/*
+Fuse the original subgraph into one spatial_transformer_resblock_xpu op.
+Currently there are 3 different original patterns to match.
+
+Original subgraph (situation 1) (TODO):
+
+          ------------Input1    Input2
+          |            |          |
+          |       group_norm     silu
+          |            |          |
+          |          silu      _xpu_fc
+          |            |          |
+          |      _xpu_conv2d  unsqueeze
+          |             \        /
+          |              \      /
+          |               \    /
+          |                \  /
+          |          elementwise_add
+          |                 |
+          |            group_norm
+          |                 |
+          |               silu
+          |                 |
+          |           _xpu_conv2d
+          |                 |
+          |____________________elementwise_add
+                            |
+                          output
+
+Original subgraph (situation 2):
+
+          -------------- in
+          |              |
+          |     group_norm_silu_xpu
+          |              |
+          |         conv2d_xpu
+          |              |
+          |     group_norm_silu_xpu
+          |              |
+          -----------conv2d_xpu
+                         |
+                        out
+
+Original subgraph (situation 3):
+
+          -------------- in
+          |              |
+          |     group_norm_silu_xpu
+          |              |
+          |         conv2d_xpu
+          |              |
+     conv2d_xpu  group_norm_silu_xpu
+          |              |
+          -----------conv2d_xpu
+                         |
+                        out
+
+Fuse to:
+(Situation 1, TODO):
+          Input1    Input2
+              \      /
+    spatial_transformer_resblock_xpu
+                 |
+              output
+or:
+(Situations 2 and 3):
+                in
+                 |
+    spatial_transformer_resblock_xpu
+                 |
+                out
+*/
+struct SpatialTransformerResBlockXPUPattern : public PatternBase {
+  SpatialTransformerResBlockXPUPattern(PDPattern* pattern,
+                                       const std::string& name_scope,
+                                       bool conv_fix = false,
+                                       bool input_max = false,
+                                       bool has_silu_fc_input = false,
+                                       bool include_silu = false);
+  // declare operator node's name
+  PATTERN_DECL_NODE(gn_silu_0);
+  PATTERN_DECL_NODE(conv2d_0);
+  PATTERN_DECL_NODE(gn_silu_1);
+  PATTERN_DECL_NODE(conv2d_1);
+  PATTERN_DECL_NODE(conv2d_2);
+  // declare variable node's name
+  PATTERN_DECL_NODE(gn_silu_0_x);
+  PATTERN_DECL_NODE(gn_silu_0_bias);
+  PATTERN_DECL_NODE(gn_silu_0_scale);
+  PATTERN_DECL_NODE(gn_silu_0_out);
+  PATTERN_DECL_NODE(conv2d_0_bias);
+  PATTERN_DECL_NODE(conv2d_0_filter);
+  PATTERN_DECL_NODE(conv2d_0_filter_max);
+  PATTERN_DECL_NODE(conv2d_0_out);
+  PATTERN_DECL_NODE(conv2d_0_out_max);
+  PATTERN_DECL_NODE(gn_silu_1_bias);
+
PATTERN_DECL_NODE(gn_silu_1_scale); + PATTERN_DECL_NODE(gn_silu_1_out); + PATTERN_DECL_NODE(conv2d_1_bias); + PATTERN_DECL_NODE(conv2d_1_filter); + PATTERN_DECL_NODE(conv2d_1_filter_max); + PATTERN_DECL_NODE(conv2d_1_out); + PATTERN_DECL_NODE(conv2d_1_out_max); + PATTERN_DECL_NODE(conv2d_2_x_max); + PATTERN_DECL_NODE(conv2d_2_bias); + PATTERN_DECL_NODE(conv2d_2_filter); + PATTERN_DECL_NODE(conv2d_2_filter_max); + PATTERN_DECL_NODE(conv2d_2_out); + PATTERN_DECL_NODE(conv2d_2_out_max); + + private: + bool conv_fix_{false}; + bool input_max_{false}; + bool has_silu_fc_input_{false}; + bool include_silu_{false}; +}; + +SpatialTransformerResBlockXPUPattern::SpatialTransformerResBlockXPUPattern( + PDPattern* pattern, + const std::string& name_scope, + bool conv_fix, + bool input_max, + bool has_silu_fc_input, + bool include_silu) + : PatternBase(pattern, name_scope, name_scope), + conv_fix_(conv_fix), + input_max_(input_max), + has_silu_fc_input_(has_silu_fc_input), + include_silu_(include_silu) { + // gn_silu_0 + auto gn_silu_0 = + pattern->NewNode(gn_silu_0_repr())->assert_is_op("group_norm_silu_xpu"); + auto gn_silu_0_x = pattern->NewNode(gn_silu_0_x_repr()) + ->assert_is_op_input("group_norm_silu_xpu", "x") + ->AsInput(); + auto gn_silu_0_bias = pattern->NewNode(gn_silu_0_bias_repr()) + ->assert_is_op_input("group_norm_silu_xpu", "bias") + ->AsInput(); + auto gn_silu_0_scale = + pattern->NewNode(gn_silu_0_scale_repr()) + ->assert_is_op_input("group_norm_silu_xpu", "scale") + ->AsInput(); + auto gn_silu_0_out = pattern->NewNode(gn_silu_0_out_repr()) + ->assert_is_op_output("group_norm_silu_xpu", "out") + ->assert_is_op_input("conv2d_xpu", "x") + ->assert_has_n_outputs(1); + gn_silu_0->LinksFrom({gn_silu_0_x, gn_silu_0_bias, gn_silu_0_scale}) + .LinksTo({gn_silu_0_out}); + + PDNode* conv2d_2_x_max = nullptr; + PDNode* conv2d_2_bias = nullptr; + PDNode* conv2d_2_filter = nullptr; + PDNode* conv2d_2_filter_max = nullptr; + PDNode* conv2d_2_out = nullptr; + PDNode* conv2d_2_out_max = nullptr; + if (conv_fix_) { + gn_silu_0_x->assert_is_op_input("conv2d_xpu", "x"); // conv2d_2 x + if (input_max_) { + conv2d_2_x_max = pattern->NewNode(conv2d_2_x_max_repr()) + ->assert_is_op_input("conv2d_xpu", "x_max") + ->AsInput(); + } + // conv2d_2 + auto conv2d_2 = + pattern->NewNode(conv2d_2_repr())->assert_is_op("conv2d_xpu"); + conv2d_2_bias = pattern->NewNode(conv2d_2_bias_repr()) + ->assert_is_op_input("conv2d_xpu", "bias") + ->AsInput(); + conv2d_2_filter = pattern->NewNode(conv2d_2_filter_repr()) + ->assert_is_op_input("conv2d_xpu", "filter") + ->AsInput(); + conv2d_2_filter_max = pattern->NewNode(conv2d_2_filter_max_repr()) + ->assert_is_op_input("conv2d_xpu", "filter_max") + ->AsInput(); + conv2d_2_out = pattern->NewNode(conv2d_2_out_repr()) + ->assert_is_op_output("conv2d_xpu", "out") + ->assert_is_op_input("conv2d_xpu", "branch") + ->assert_has_n_outputs(1); + conv2d_2_out_max = pattern->NewNode(conv2d_2_out_max_repr()) + ->assert_is_op_output("conv2d_xpu", "out_max"); + std::vector conv2d_2_input{ + gn_silu_0_x, conv2d_2_bias, conv2d_2_filter, conv2d_2_filter_max}; + if (input_max_) { + conv2d_2_input.push_back(conv2d_2_x_max); + } + conv2d_2->LinksFrom(conv2d_2_input) + .LinksTo({conv2d_2_out, conv2d_2_out_max}); + } else { + gn_silu_0_x->assert_is_op_input("conv2d_xpu", "branch"); // conv2d_1 branch + conv2d_2_out = gn_silu_0_x; + } + + // conv2d_0 + auto conv2d_0 = pattern->NewNode(conv2d_0_repr())->assert_is_op("conv2d_xpu"); + auto conv2d_0_bias = pattern->NewNode(conv2d_0_bias_repr()) + 
->assert_is_op_input("conv2d_xpu", "bias") + ->AsInput(); + auto conv2d_0_filter = pattern->NewNode(conv2d_0_filter_repr()) + ->assert_is_op_input("conv2d_xpu", "filter") + ->AsInput(); + auto conv2d_0_filter_max = + pattern->NewNode(conv2d_0_filter_max_repr()) + ->assert_is_op_input("conv2d_xpu", "filter_max") + ->AsInput(); + auto conv2d_0_out = pattern->NewNode(conv2d_0_out_repr()) + ->assert_is_op_output("conv2d_xpu", "out") + ->assert_is_op_input("group_norm_silu_xpu", "x") + ->assert_has_n_outputs(1); + auto conv2d_0_out_max = pattern->NewNode(conv2d_0_out_max_repr()) + ->assert_is_op_output("conv2d_xpu", "out_max"); + conv2d_0 + ->LinksFrom( + {gn_silu_0_out, conv2d_0_bias, conv2d_0_filter, conv2d_0_filter_max}) + .LinksTo({conv2d_0_out, conv2d_0_out_max}); + + // gn_silu_1 + auto gn_silu_1 = + pattern->NewNode(gn_silu_1_repr())->assert_is_op("group_norm_silu_xpu"); + auto gn_silu_1_bias = pattern->NewNode(gn_silu_1_bias_repr()) + ->assert_is_op_input("group_norm_silu_xpu", "bias") + ->assert_is_persistable_var() + ->AsInput(); + auto gn_silu_1_scale = + pattern->NewNode(gn_silu_1_scale_repr()) + ->assert_is_op_input("group_norm_silu_xpu", "scale") + ->assert_is_persistable_var() + ->AsInput(); + auto gn_silu_1_out = pattern->NewNode(gn_silu_1_out_repr()) + ->assert_is_op_output("group_norm_silu_xpu", "out") + ->assert_is_op_input("conv2d_xpu", "x") + ->assert_has_n_outputs(1); + gn_silu_1->LinksFrom({conv2d_0_out, gn_silu_1_bias, gn_silu_1_scale}) + .LinksTo({gn_silu_1_out}); + + // conv2d_1 + auto conv2d_1 = pattern->NewNode(conv2d_1_repr())->assert_is_op("conv2d_xpu"); + auto conv2d_1_bias = pattern->NewNode(conv2d_1_bias_repr()) + ->assert_is_op_input("conv2d_xpu", "bias") + ->AsInput(); + auto conv2d_1_filter = pattern->NewNode(conv2d_1_filter_repr()) + ->assert_is_op_input("conv2d_xpu", "filter") + ->AsInput(); + auto conv2d_1_filter_max = + pattern->NewNode(conv2d_1_filter_max_repr()) + ->assert_is_op_input("conv2d_xpu", "filter_max") + ->AsInput(); + auto conv2d_1_out = pattern->NewNode(conv2d_1_out_repr()) + ->assert_is_op_output("conv2d_xpu", "out"); + auto conv2d_1_out_max = pattern->NewNode(conv2d_1_out_max_repr()) + ->assert_is_op_output("conv2d_xpu", "out_max"); + conv2d_1 + ->LinksFrom({gn_silu_1_out, + conv2d_2_out, + conv2d_1_bias, + conv2d_1_filter, + conv2d_1_filter_max}) + .LinksTo({conv2d_1_out, conv2d_1_out_max}); +} + +} // namespace patterns + +namespace { +static std::vector IntVec2DTo1D(const std::vector>& vec) { + std::vector res; + for (const auto& v : vec) { + for (const auto& ele : v) { + res.emplace_back(ele); + } + } + return res; +} + +} // namespace + +class SpatialTransformerResBlockXPUFusePass : public FusePassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; + + private: + int FuseSpatialTransformerResBlock(ir::Graph* graph, + bool conv_fix = false, + bool input_max = false, + bool has_silu_fc_input = false, + bool include_silu = false) const; + + const std::string name_scope_{"spatial_transformer_resblock_xpu_fuse_pass"}; +}; + +void SpatialTransformerResBlockXPUFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + Init(name_scope_, graph); + int found_subgraph_count = 0; + for (auto conv_fix : {false, true}) { + for (auto has_silu_fc_input : {false}) { + for (auto include_silu : {false}) { + if (conv_fix == true) { + for (auto input_max : {true, false}) { + found_subgraph_count += + FuseSpatialTransformerResBlock(graph, + 
+
+}  // namespace
+
+class SpatialTransformerResBlockXPUFusePass : public FusePassBase {
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override;
+
+ private:
+  int FuseSpatialTransformerResBlock(ir::Graph* graph,
+                                     bool conv_fix = false,
+                                     bool input_max = false,
+                                     bool has_silu_fc_input = false,
+                                     bool include_silu = false) const;
+
+  const std::string name_scope_{"spatial_transformer_resblock_xpu_fuse_pass"};
+};
+
+void SpatialTransformerResBlockXPUFusePass::ApplyImpl(ir::Graph* graph) const {
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::PreconditionNotMet("graph should not be null."));
+  Init(name_scope_, graph);
+  int found_subgraph_count = 0;
+  for (auto conv_fix : {false, true}) {
+    for (auto has_silu_fc_input : {false}) {
+      for (auto include_silu : {false}) {
+        if (conv_fix == true) {
+          for (auto input_max : {true, false}) {
+            found_subgraph_count +=
+                FuseSpatialTransformerResBlock(graph,
+                                               conv_fix /*true*/,
+                                               input_max,
+                                               has_silu_fc_input,
+                                               include_silu);
+          }
+        } else {
+          found_subgraph_count +=
+              FuseSpatialTransformerResBlock(graph,
+                                             conv_fix /*false*/,
+                                             false,
+                                             has_silu_fc_input,
+                                             include_silu);
+        }
+      }
+    }
+  }
+
+  AddStatis(found_subgraph_count);
+}
+
+int SpatialTransformerResBlockXPUFusePass::FuseSpatialTransformerResBlock(
+    ir::Graph* graph,
+    bool conv_fix,
+    bool input_max,
+    bool has_silu_fc_input,
+    bool include_silu) const {
+  GraphPatternDetector gpd;
+  patterns::SpatialTransformerResBlockXPUPattern pattern(gpd.mutable_pattern(),
+                                                         name_scope_,
+                                                         conv_fix,
+                                                         input_max,
+                                                         has_silu_fc_input,
+                                                         include_silu);
+
+  int found_subgraph_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* graph) {
+    VLOG(4) << "handle SpatialTransformerResBlockXPUFusePass fuse";
+    // declare operator node's name
+    GET_IR_NODE(gn_silu_0);
+    GET_IR_NODE(conv2d_0);
+    GET_IR_NODE(gn_silu_1);
+    GET_IR_NODE(conv2d_1);
+    GET_IR_NODE(conv2d_2);
+    // declare variable node's name
+    GET_IR_NODE(gn_silu_0_x);
+    GET_IR_NODE(gn_silu_0_bias);
+    GET_IR_NODE(gn_silu_0_scale);
+    GET_IR_NODE(gn_silu_0_out);
+    GET_IR_NODE(conv2d_0_bias);
+    GET_IR_NODE(conv2d_0_filter);
+    GET_IR_NODE(conv2d_0_filter_max);
+    GET_IR_NODE(conv2d_0_out);
+    GET_IR_NODE(conv2d_0_out_max);
+    GET_IR_NODE(gn_silu_1_bias);
+    GET_IR_NODE(gn_silu_1_scale);
+    GET_IR_NODE(gn_silu_1_out);
+    GET_IR_NODE(conv2d_1_bias);
+    GET_IR_NODE(conv2d_1_filter);
+    GET_IR_NODE(conv2d_1_filter_max);
+    GET_IR_NODE(conv2d_1_out);
+    GET_IR_NODE(conv2d_1_out_max);
+    GET_IR_NODE(conv2d_2_x_max);
+    GET_IR_NODE(conv2d_2_bias);
+    GET_IR_NODE(conv2d_2_filter);
+    GET_IR_NODE(conv2d_2_filter_max);
+    GET_IR_NODE(conv2d_2_out);
+    GET_IR_NODE(conv2d_2_out_max);
+
+    auto* block = gn_silu_1->Op()->Block();
+    auto* scope = param_scope();
+    PADDLE_ENFORCE_NOT_NULL(
+        scope, platform::errors::InvalidArgument("Scope cannot be nullptr."));
+    // delete useless node
+    std::unordered_set<const Node*> delete_nodes;
+
+    std::vector<std::vector<int>> strides;
+    std::vector<std::vector<int>> paddings;
+    std::vector<std::vector<int>> dilations;
+    std::vector<int> groups;
+    std::vector<float> gn_eps;
+    std::vector<int> gn_groups;
+
+    // get attr
+    float gn_silu_0_eps =
+        PADDLE_GET_CONST(float, gn_silu_0->Op()->GetAttr("epsilon"));
+    gn_eps.emplace_back(gn_silu_0_eps);
+    int gn_silu_0_groups =
+        PADDLE_GET_CONST(int, gn_silu_0->Op()->GetAttr("groups"));
+    gn_groups.emplace_back(gn_silu_0_groups);
+    float gn_silu_1_eps =
+        PADDLE_GET_CONST(float, gn_silu_1->Op()->GetAttr("epsilon"));
+    gn_eps.emplace_back(gn_silu_1_eps);
+    int gn_silu_1_groups =
+        PADDLE_GET_CONST(int, gn_silu_1->Op()->GetAttr("groups"));
+    gn_groups.emplace_back(gn_silu_1_groups);
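+
+    // PADDLE_GET_CONST is the typed accessor over the stored Attribute
+    // variant; the requested type must match the type the op registered.
+    // Sketch of the shapes used throughout this handler:
+    //   float eps = PADDLE_GET_CONST(float, op->GetAttr("epsilon"));
+    //   auto s = PADDLE_GET_CONST(std::vector<int>, op->GetAttr("strides"));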
conv2d_1->Op()->GetAttr("dilations")); + dilations.emplace_back(std::move(conv2d_1_dilations)); + int conv2d_1_groups = + PADDLE_GET_CONST(int, conv2d_1->Op()->GetAttr("groups")); + groups.emplace_back(std::move(conv2d_1_groups)); + auto conv2d_1_paddings = + PADDLE_GET_CONST(std::vector, conv2d_1->Op()->GetAttr("paddings")); + paddings.emplace_back(std::move(conv2d_1_paddings)); + std::string conv2d_1_padding_algorithm = PADDLE_GET_CONST( + std::string, conv2d_1->Op()->GetAttr("padding_algorithm")); + auto conv2d_1_strides = + PADDLE_GET_CONST(std::vector, conv2d_1->Op()->GetAttr("strides")); + strides.emplace_back(std::move(conv2d_1_strides)); + + std::vector conv_bias_names{conv2d_0_bias->Name(), + conv2d_1_bias->Name()}; + std::vector conv_filter_names{conv2d_0_filter->Name(), + conv2d_1_filter->Name()}; + std::vector conv_filter_max_names{conv2d_0_filter_max->Name(), + conv2d_1_filter_max->Name()}; + + // conv2d_2 + std::string conv2d_2_padding_algorithm; + if (conv_fix) { + auto conv2d_2_dilations = PADDLE_GET_CONST( + std::vector, conv2d_2->Op()->GetAttr("dilations")); + dilations.emplace_back(std::move(conv2d_2_dilations)); + int conv2d_2_groups = + PADDLE_GET_CONST(int, conv2d_2->Op()->GetAttr("groups")); + groups.emplace_back(std::move(conv2d_2_groups)); + auto conv2d_2_paddings = PADDLE_GET_CONST( + std::vector, conv2d_2->Op()->GetAttr("paddings")); + paddings.emplace_back(std::move(conv2d_2_paddings)); + conv2d_2_padding_algorithm = PADDLE_GET_CONST( + std::string, conv2d_2->Op()->GetAttr("padding_algorithm")); + auto conv2d_2_strides = PADDLE_GET_CONST( + std::vector, conv2d_2->Op()->GetAttr("strides")); + strides.emplace_back(std::move(conv2d_2_strides)); + + conv_bias_names.emplace_back(std::move(conv2d_2_bias->Name())); + conv_filter_names.emplace_back(std::move(conv2d_2_filter->Name())); + conv_filter_max_names.emplace_back( + std::move(conv2d_2_filter_max->Name())); + } + + std::string fused_op_out_name; + fused_op_out_name = conv2d_1_out->Name(); + // Generate add_layernorm fused op + framework::OpDesc fused_op_desc(block); + + fused_op_desc.SetType("spatial_transformer_resblock_xpu"); + // set attrs for fused op + fused_op_desc.SetInput("x", {gn_silu_0_x->Name()}); + + if (input_max) { + fused_op_desc.SetInput("x_max", {conv2d_2_x_max->Name()}); + } else { + fused_op_desc.SetInput("x_max", {}); + } + + fused_op_desc.SetInput("conv_bias", conv_bias_names); + fused_op_desc.SetInput("conv_filter", conv_filter_names); + fused_op_desc.SetInput("conv_filter_max", conv_filter_max_names); + fused_op_desc.SetInput("gn_bias", + {gn_silu_0_bias->Name(), gn_silu_1_bias->Name()}); + fused_op_desc.SetInput("gn_scale", + {gn_silu_0_scale->Name(), gn_silu_1_scale->Name()}); + fused_op_desc.SetOutput("out", {fused_op_out_name}); + fused_op_desc.SetOutput("out_max", {conv2d_1_out_max->Name()}); + + fused_op_desc.SetAttr("dilations", IntVec2DTo1D(dilations)); + fused_op_desc.SetAttr("paddings", IntVec2DTo1D(paddings)); + fused_op_desc.SetAttr("strides", IntVec2DTo1D(strides)); + fused_op_desc.SetAttr("groups", groups); + fused_op_desc.SetAttr("gn_eps", gn_eps); + fused_op_desc.SetAttr("gn_groups", gn_groups); + fused_op_desc.SetAttr("conv_fix", conv_fix); + fused_op_desc.SetAttr("has_silu_fc_input", has_silu_fc_input); + fused_op_desc.SetAttr("include_silu", include_silu); + + // relink fused op + auto* fused_op = graph->CreateOpNode(&fused_op_desc); + + IR_NODE_LINK_TO(gn_silu_0_x, fused_op); + IR_NODE_LINK_TO(gn_silu_0_bias, fused_op); + IR_NODE_LINK_TO(gn_silu_0_scale, fused_op); + 
+
+    // relink fused op
+    auto* fused_op = graph->CreateOpNode(&fused_op_desc);
+
+    IR_NODE_LINK_TO(gn_silu_0_x, fused_op);
+    IR_NODE_LINK_TO(gn_silu_0_bias, fused_op);
+    IR_NODE_LINK_TO(gn_silu_0_scale, fused_op);
+    IR_NODE_LINK_TO(conv2d_0_bias, fused_op);
+    IR_NODE_LINK_TO(conv2d_0_filter, fused_op);
+    IR_NODE_LINK_TO(conv2d_0_filter_max, fused_op);
+    IR_NODE_LINK_TO(gn_silu_1_bias, fused_op);
+    IR_NODE_LINK_TO(gn_silu_1_scale, fused_op);
+    IR_NODE_LINK_TO(conv2d_1_bias, fused_op);
+    IR_NODE_LINK_TO(conv2d_1_filter, fused_op);
+    IR_NODE_LINK_TO(conv2d_1_filter_max, fused_op);
+
+    if (conv_fix) {
+      if (input_max) {
+        IR_NODE_LINK_TO(conv2d_2_x_max, fused_op);
+      }
+      IR_NODE_LINK_TO(conv2d_2_bias, fused_op);
+      IR_NODE_LINK_TO(conv2d_2_filter, fused_op);
+      IR_NODE_LINK_TO(conv2d_2_filter_max, fused_op);
+    }
+
+    IR_NODE_LINK_TO(fused_op, conv2d_1_out);
+    IR_NODE_LINK_TO(fused_op, conv2d_1_out_max);
+
+    delete_nodes.insert({gn_silu_0,
+                         gn_silu_1,
+                         conv2d_0,
+                         conv2d_1,
+                         gn_silu_0_out,
+                         conv2d_0_out,
+                         conv2d_0_out_max,
+                         gn_silu_1_out});
+
+    if (conv_fix) {
+      delete_nodes.insert({conv2d_2, conv2d_2_out, conv2d_2_out_max});
+    }
+    GraphSafeRemoveNodes(graph, delete_nodes);
+    found_subgraph_count++;
+  };
+
+  gpd(graph, handler);
+  return found_subgraph_count;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(spatial_transformer_resblock_xpu_fuse_pass,
+              paddle::framework::ir::SpatialTransformerResBlockXPUFusePass);
+
+REGISTER_PASS_CAPABILITY(spatial_transformer_resblock_xpu_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination().EQ(
+            "spatial_transformer_resblock_xpu", 0));
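The handler above follows the usual fuse-pass shape: build one OpDesc for the fused op, relink the surviving variable nodes, then drop the matched subgraph in a single call. Stripped to its bones (a sketch with hypothetical node names, not the pass's real ones):

    auto* fused_op = graph->CreateOpNode(&fused_op_desc);  // new fused node
    IR_NODE_LINK_TO(input_var, fused_op);                  // keep live inputs
    IR_NODE_LINK_TO(fused_op, output_var);                 // keep live outputs
    GraphSafeRemoveNodes(graph, {old_op_a, old_op_b, dead_var});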
diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc
index 8a319b8a350a0..381215b857303 100644
--- a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc
+++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc
@@ -164,7 +164,9 @@ void XPUQuantizeOpPass::QuantizeConv(ir::Graph* graph) const {
         out_var_node = output_node;
       }
     }
-    if (!AreScalesPresentForNodes(&var_quant_scales_, {x_var_node})) {
+    if (!AreScalesPresentForNodes(&var_quant_scales_, {x_var_node}) ||
+        w_var_node->Var()->GetDataType() !=
+            proto::VarType::Type::VarType_Type_INT8) {
       VLOG(4) << "Skip quantize op: " << n->Name()
               << "x_var_node_name:" << x_var_node->Name()
               << " w_var_node_name:" << w_var_node->Name();
@@ -239,8 +241,9 @@ void XPUQuantizeOpPass::QuantizeFC(ir::Graph* graph) const {
         out_var_node = output_node;
       }
     }
-    if (!AreScalesPresentForNodes(&var_quant_scales_,
-                                  {x_var_node, w_var_node})) {
+    if (!AreScalesPresentForNodes(&var_quant_scales_, {x_var_node}) ||
+        w_var_node->Var()->GetDataType() !=
+            proto::VarType::Type::VarType_Type_INT8) {
       MarkAndLogCannotQuantizeOp(n, "No scale available for the operator");
       continue;
     }
@@ -261,6 +264,71 @@ void XPUQuantizeOpPass::QuantizeFC(ir::Graph* graph) const {
   }
 }
 
+void XPUQuantizeOpPass::QuantizeQkvAttention(ir::Graph* graph) const {
+  for (auto* n : graph->Nodes()) {
+    if (n->IsOp()) {
+      auto* op = n->Op();
+      if (op->Type() != "qkv_attention_xpu") {
+        continue;
+      }
+      std::vector<std::string> max_node_names = {
+          "q_max", "k_max", "v_max", "qk_max"};
+      std::unordered_map<std::string, Node*> input_node_map;
+      for (auto* input_node : n->inputs) {
+        if (!input_node->IsVar()) {
+          continue;
+        }
+        for (auto input_name : op->InputNames()) {
+          if (op->Input(input_name)[0] == input_node->Var()->Name()) {
+            input_node_map[input_name] = input_node;
+          }
+        }
+      }
+      bool continue_flag = false;
+      for (auto max_name : max_node_names) {
+        if (input_node_map.find(max_name) == input_node_map.end()) {
+          continue_flag = true;
+          break;
+        }
+      }
+      if (continue_flag) {
+        continue;
+      }
+      Node* out_var_node = nullptr;
+      for (auto* output_node : n->outputs) {
+        if (!output_node->IsVar()) {
+          continue;
+        }
+        if (output_node->Var()->Name() == op->Output("qkv")[0]) {
+          out_var_node = output_node;
+        }
+      }
+      if (input_node_map["q"]->Name() == input_node_map["k"]->Name() &&
+          input_node_map["q"]->Name() == input_node_map["v"]->Name()) {
+        QuantizeInput(graph, n, input_node_map["q"], "q");
+        op->SetInput("k", op->Input("q"));
+        op->SetInput("v", op->Input("q"));
+        UnlinkNodes(input_node_map["k"], n);
+        UnlinkNodes(input_node_map["v"], n);
+      } else {
+        QuantizeInput(graph, n, input_node_map["q"], "q");
+        QuantizeInput(graph, n, input_node_map["k"], "k");
+        QuantizeInput(graph, n, input_node_map["v"], "v");
+      }
+      auto has_output_scale =
+          AreScalesPresentForNodes(&var_quant_scales_, {out_var_node});
+      if (has_output_scale) {
+        DequantizeOutput(graph, n, out_var_node, "qkv");
+        n->Op()->SetAttr(
+            "out_dtype",
+            static_cast<int>(proto::VarType::Type::VarType_Type_INT8));
+      } else {
+        n->Op()->SetAttr("out_dtype",
+                         input_node_map["q"]->Var()->GetDataType());
+      }
+    }
+  }
+}
 void XPUQuantizeOpPass::ApplyImpl(ir::Graph* graph) const {
   VLOG(3) << "Insert quantize/dequantize op to the graph.";
   PADDLE_ENFORCE_NOT_NULL(
@@ -273,6 +341,7 @@ void XPUQuantizeOpPass::ApplyImpl(ir::Graph* graph) const {
   GetQuantInfo(graph);
   QuantizeConv(graph);
   QuantizeFC(graph);
+  QuantizeQkvAttention(graph);
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h
index 28d0f42e76bde..312b6a540c8cc 100644
--- a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h
+++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h
@@ -38,6 +38,7 @@ class XPUQuantizeOpPass : public FusePassBase {
 protected:
  void ApplyImpl(Graph* graph) const override;
  void QuantizeConv(Graph* graph) const;
+  void QuantizeQkvAttention(Graph* graph) const;
  void QuantizeFC(Graph* graph) const;
 
 private:
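Two details worth flagging in QuantizeQkvAttention above: `out_dtype` is only forced to INT8 when an output scale is actually available (otherwise the output keeps q's dtype), and self-attention is special-cased so the same tensor is not quantized three times. The latter, distilled (same calls as in the pass, variables abbreviated):

    // When q, k and v alias one variable, a single quantize op serves all
    // three inputs; k and v are re-pointed at the quantized q.
    if (q->Name() == k->Name() && q->Name() == v->Name()) {
      QuantizeInput(graph, n, q, "q");
      op->SetInput("k", op->Input("q"));
      op->SetInput("v", op->Input("q"));
    }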
(op->Type() == "lite_engine") { - operators::LiteEngineOp *lite_op = - dynamic_cast(op.get()); - PADDLE_ENFORCE_NOT_NULL( - lite_op, - phi::errors::InvalidArgument( - "lite_op(type: lite_engine) should be created.")); - std::string engine_key = lite_op->Attr("engine_key"); - std::string new_engine_key = engine_key + "_" + std::to_string(num); - PADDLE_ENFORCE( - paddle::inference::Singleton::Global() - .Has(engine_key), - phi::errors::InvalidArgument( - "lite_engine(key: %s) should be created.", engine_key)); - auto *lite_engine = - paddle::inference::Singleton::Global() - .Get(engine_key); - auto new_lite_engine = lite_engine->Clone(); -#ifdef LITE_SUBGRAPH_WITH_XPU - new_lite_engine->SetStream(TARGET(kXPU), stream); -#endif - paddle::inference::Singleton::Global() - .Set(new_engine_key, new_lite_engine); - lite_op->SetAttr("engine_key", new_engine_key); - lite_op->SetEngine(new_lite_engine.get()); - } - } -#endif -} +void NaiveExecutor::CloneLiteEngine(int num, void *stream) {} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h index d36e3042b0b72..47f58924de144 100644 --- a/paddle/fluid/framework/naive_executor.h +++ b/paddle/fluid/framework/naive_executor.h @@ -45,6 +45,9 @@ class NaiveExecutor { public: using HookFunc = std::function; + using PirHookFunc = + std::function; + explicit NaiveExecutor(const platform::Place& place) : place_(place) {} ~NaiveExecutor(); @@ -94,6 +97,8 @@ class NaiveExecutor { void RegisterOutputHook(const HookFunc& hookfunc); void RegisterInputHook(const HookFunc& hookfunc); + void RegisterOutputHook(const PirHookFunc& hookfunc); + void RegisterInputHook(const PirHookFunc& hookfunc); private: void CreateOps(const ProgramDesc& desc, int block_id); @@ -107,6 +112,9 @@ class NaiveExecutor { std::vector output_hookfuncs_; std::vector input_hookfuncs_; + std::vector pir_output_hookfuncs_; + std::vector pir_input_hookfuncs_; + // Record information that tensor_a should ShareBufferWith tensor_b. 
std::unordered_map> reuse_cache_; diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt index d06fdd8c4c7cd..01c6cd7c12a43 100644 --- a/paddle/fluid/framework/new_executor/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/CMakeLists.txt @@ -5,7 +5,7 @@ if(NOT (WITH_CINN)) ${CMAKE_CURRENT_SOURCE_DIR}/instruction/cinn_jit_instruction.cc) endif() -if(NOT WITH_MKLDNN) +if(NOT WITH_ONEDNN) list( REMOVE_ITEM standalone_executor_srcs @@ -54,6 +54,6 @@ cc_library( add_dependencies(standalone_executor xxhash framework_proto) -if(WITH_MKLDNN) - add_dependencies(standalone_executor mkldnn) +if(WITH_ONEDNN) + add_dependencies(standalone_executor onednn) endif() diff --git a/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc b/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc index db8ef9f2de7bf..8bd67fe50d698 100644 --- a/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc @@ -38,7 +38,7 @@ #include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" #ifdef PADDLE_WITH_DNNL -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/onednn_helper.h" #endif namespace paddle { @@ -198,6 +198,16 @@ IfInstruction::~IfInstruction() { } } +void IfInstruction::SetOutputHooks(const std::vector& hookfuncs) { + true_branch_inter_->SetOutputHooks(hookfuncs); + false_branch_inter_->SetOutputHooks(hookfuncs); +} + +void IfInstruction::SetInputHooks(const std::vector& hookfuncs) { + true_branch_inter_->SetInputHooks(hookfuncs); + false_branch_inter_->SetInputHooks(hookfuncs); +} + void IfInstruction::Run() { bool cond = true; if (cond_var_->IsType()) { diff --git a/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.h b/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.h index cf0de0fc3581f..7667c9128a8a7 100644 --- a/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.h +++ b/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.h @@ -48,6 +48,10 @@ class IfInstruction : public InstructionBase { PirInterpreter* FalseBranchInterpreter() const { return false_branch_inter_; } + void SetOutputHooks(const std::vector& hookfuncs); + + void SetInputHooks(const std::vector& hookfuncs); + private: ::pir::Operation* op_; diff --git a/paddle/fluid/framework/new_executor/instruction/control_flow/pylayer_instruction.cc b/paddle/fluid/framework/new_executor/instruction/control_flow/pylayer_instruction.cc index 56bf04227d49b..838f6dbce67b6 100644 --- a/paddle/fluid/framework/new_executor/instruction/control_flow/pylayer_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/control_flow/pylayer_instruction.cc @@ -37,7 +37,7 @@ #include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" #ifdef PADDLE_WITH_DNNL -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/onednn_helper.h" #endif namespace paddle { diff --git a/paddle/fluid/framework/new_executor/instruction/control_flow/while_instruction.cc b/paddle/fluid/framework/new_executor/instruction/control_flow/while_instruction.cc index ae8b0d1df2eee..1385f1d357a3d 100644 --- a/paddle/fluid/framework/new_executor/instruction/control_flow/while_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/control_flow/while_instruction.cc @@ -38,7 +38,7 @@ #include 
"paddle/fluid/pir/dialect/operator/ir/manual_op.h" #ifdef PADDLE_WITH_DNNL -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/onednn_helper.h" #endif namespace paddle { @@ -240,6 +240,16 @@ void WhileInstruction::ShareDatasToOutputs() { } } +void WhileInstruction::SetOutputHooks( + const std::vector& hookfuncs) { + body_inter_->SetOutputHooks(hookfuncs); +} + +void WhileInstruction::SetInputHooks( + const std::vector& hookfuncs) { + body_inter_->SetInputHooks(hookfuncs); +} + void WhileInstruction::Run() { #ifdef PADDLE_WITH_DNNL // Executor on being destroyed clears oneDNN cache and resets diff --git a/paddle/fluid/framework/new_executor/instruction/control_flow/while_instruction.h b/paddle/fluid/framework/new_executor/instruction/control_flow/while_instruction.h index 849d4ec4d184d..b6f729a784f5a 100644 --- a/paddle/fluid/framework/new_executor/instruction/control_flow/while_instruction.h +++ b/paddle/fluid/framework/new_executor/instruction/control_flow/while_instruction.h @@ -50,6 +50,10 @@ class WhileInstruction : public InstructionBase { PirInterpreter* BodyInterpreter() const { return body_inter_.get(); } + void SetOutputHooks(const std::vector& hookfuncs); + + void SetInputHooks(const std::vector& hookfuncs); + private: // 'output' = 'input' void ShareInputsToOutputs(); diff --git a/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc b/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc index 3bc5893a162b3..00b5410247ddc 100644 --- a/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc @@ -702,7 +702,7 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key, &arguments); } #ifdef PADDLE_WITH_DNNL - // For input that is Extra, only MKLDNN will use Extra Inputs + // For input that is Extra, only OneDNN will use Extra Inputs auto& extra_input_names = paddle::operators::ExtraInfoUtils::Instance().GetExtraInputNamesMap( op_with_kernel->Type()); diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 1e093f7247320..850a038ea790c 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -42,7 +42,7 @@ #include "paddle/phi/core/kernel_factory.h" #ifdef PADDLE_WITH_DNNL -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/onednn_helper.h" #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE diff --git a/paddle/fluid/framework/new_executor/interpreter/static_build.cc b/paddle/fluid/framework/new_executor/interpreter/static_build.cc index 131f756bdb1d3..ac58f499e91ca 100644 --- a/paddle/fluid/framework/new_executor/interpreter/static_build.cc +++ b/paddle/fluid/framework/new_executor/interpreter/static_build.cc @@ -24,7 +24,7 @@ #include "paddle/fluid/platform/flags.h" #ifdef PADDLE_WITH_DNNL -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/onednn_helper.h" #endif COMMON_DECLARE_bool(cache_inference_while_scope); diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index e99a02f37136e..1a6fe75fc518a 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -104,13 +104,17 @@ class InterpreterBaseImpl { virtual void 
SetInputHooks(const std::vector& hookfuncs) = 0; + virtual void SetOutputHooks(const std::vector& hookfuncs) = 0; + + virtual void SetInputHooks(const std::vector& hookfuncs) = 0; + virtual std::shared_ptr> GetDependencyCount() const = 0; virtual bool IsSharedResultsBuild() const = 0; - virtual void Build( - const std::vector& feed_names, - std::vector* op_func_nodes) = 0; + virtual void Build(const std::vector& feed_names, + std::vector* op_func_nodes, + bool switch_stream = false) = 0; virtual bool IsStaticBuild() const = 0; diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 61151373b2a29..7bf78eed8b04e 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -139,6 +139,15 @@ void InterpreterCore::SetOutputHooks(const std::vector& hookfuncs) { impl_->SetOutputHooks(hookfuncs); } +void InterpreterCore::SetInputHooks(const std::vector& hookfuncs) { + impl_->SetInputHooks(hookfuncs); +} + +void InterpreterCore::SetOutputHooks( + const std::vector& hookfuncs) { + impl_->SetOutputHooks(hookfuncs); +} + void InterpreterCore::Build( const std::vector& feed_names, std::vector* op_func_nodes) { diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index f2b4426b8ebb2..39ad549a78455 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -14,6 +14,7 @@ #pragma once #include "paddle/fluid/framework/new_executor/interpreter_base_impl.h" +#include "paddle/fluid/framework/new_executor/new_executor_defs.h" PD_DECLARE_bool(new_executor_use_local_scope); @@ -88,6 +89,10 @@ class InterpreterCore { void SetInputHooks(const std::vector& hookfuncs); + void SetOutputHooks(const std::vector& hookfuncs); + + void SetInputHooks(const std::vector& hookfuncs); + void Build(const std::vector& feed_names, std::vector* op_func_nodes); diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index c416b151aef03..79619828980aa 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -40,9 +40,13 @@ COMMON_DECLARE_bool(dynamic_static_unified_comm); namespace paddle { namespace framework { +class InstructionBase; +class ValueExecutionInfo; using OpKernelComputeFunc = std::function; using HookFunc = std::function; +using PirHookFunc = + std::function; using SchedulingPriority = int64_t; diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index 03439ad6fd417..a8d525ee9e93b 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -37,7 +37,7 @@ #include "paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.h" #include "paddle/fluid/framework/new_executor/instruction/onednn/onednn_legacy_instruction.h" #include "paddle/fluid/framework/new_executor/instruction/onednn/onednn_mixed_instruction.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/onednn_helper.h" #endif #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" @@ -723,8 +723,16 @@ void PirInterpreter::BuildInstruction() { } } else if (op.dialect()->name() == "pd_op") { if (op.isa()) { // NOLINT - 
vec_instruction_base_.emplace_back(std::make_unique( - op_idx++, place_, &op, value_exe_info_.get(), execution_config_)); + std::unique_ptr if_instr_ptr = + std::make_unique(op_idx++, + place_, + &op, + value_exe_info_.get(), + execution_config_); + if_instr_ptr->SetOutputHooks(pir_output_hookfuncs_); + if_instr_ptr->SetInputHooks(pir_input_hookfuncs_); + vec_instruction_base_.emplace_back(std::move(if_instr_ptr)); + sub_blocks_.insert( {&op.dyn_cast().true_block(), dynamic_cast(vec_instruction_base_.back().get()) @@ -742,8 +750,16 @@ void PirInterpreter::BuildInstruction() { vec_instruction_base_.back().get()) ->ForwardInterpreter()}); } else if (op.isa()) { - vec_instruction_base_.emplace_back(std::make_unique( - op_idx++, place_, &op, value_exe_info_.get(), execution_config_)); + std::unique_ptr while_instr_ptr = + std::make_unique(op_idx++, + place_, + &op, + value_exe_info_.get(), + execution_config_); + while_instr_ptr->SetOutputHooks(pir_output_hookfuncs_); + while_instr_ptr->SetInputHooks(pir_input_hookfuncs_); + vec_instruction_base_.emplace_back(std::move(while_instr_ptr)); + sub_blocks_.insert( {&op.dyn_cast().body(), dynamic_cast(vec_instruction_base_.back().get()) @@ -1183,7 +1199,9 @@ void PirInterpreter::CalculateLastLiveOps() { for (auto& item : ins_and_outs) { for (auto var_id : item.second) { // skip no_need_buffer input vars - if (ins.count(item.first) && instr->NoNeedBuffer().count(item.first)) { + if ((ins.count(item.first) && + instr->NoNeedBuffer().count(item.first)) || + instr->Name() == "builtin_combine_instruction") { continue; } gc_check_vars.insert(var_id); @@ -1764,6 +1782,13 @@ void PirInterpreter::RunInstructionBase(InstructionBase* instr_node) { << " runs on " << platform::GetCurrentThreadName() << "\n" << "Before: " << cur_place << " " << instr_node->DebugStringEx(scope_, value_exe_info_.get()); + + if (execution_config_.used_for_inference) { + for (auto& hook : pir_input_hookfuncs_) { + hook(instr_node, value_exe_info_.get(), scope_); + } + } + if (!instr_node->IsArtificial()) { instr_node->Run(); @@ -1789,6 +1814,13 @@ void PirInterpreter::RunInstructionBase(InstructionBase* instr_node) { VLOG(4) << "done CheckGC"; memory::LogDeviceMemoryStats(cur_place, instr_node->Name()); } + + if (execution_config_.used_for_inference) { + for (auto& hook : pir_output_hookfuncs_) { + hook(instr_node, value_exe_info_.get(), scope_); + } + } + VLOG(5) << "after run kernel"; instr_node->RecordEvent(cur_place); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -1897,7 +1929,8 @@ Variable* PirInterpreter::DebugVar(const std::string& name) const { void PirInterpreter::Build( const std::vector& feed_names, - std::vector* op_func_nodes) { + std::vector* op_func_nodes, + bool switch_stream) { PADDLE_THROW(platform::errors::Unimplemented( "Build is not implemented in PirInterpreter.")); } diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.h b/paddle/fluid/framework/new_executor/pir_interpreter.h index e28e418b9dd95..819bf7486d685 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.h +++ b/paddle/fluid/framework/new_executor/pir_interpreter.h @@ -96,12 +96,16 @@ class PirInterpreter : public InterpreterBaseImpl { const platform::Place& GetPlace() const override { return place_; } - void SetOutputHooks(const std::vector& hookfuncs) override { - output_hookfuncs_ = hookfuncs; + void SetOutputHooks(const std::vector& hookfuncs) override {} + + void SetInputHooks(const std::vector& hookfuncs) override {} + + void SetOutputHooks(const std::vector& 
hookfuncs) override { + pir_output_hookfuncs_ = hookfuncs; } - void SetInputHooks(const std::vector& hookfuncs) override { - input_hookfuncs_ = hookfuncs; + void SetInputHooks(const std::vector& hookfuncs) override { + pir_input_hookfuncs_ = hookfuncs; } std::string GetNameByValue(::pir::Value value) const; @@ -138,9 +142,9 @@ class PirInterpreter : public InterpreterBaseImpl { void CheckCUDAGraphBeforeRun(const std::vector& feed_names); void PrepareForCUDAGraphCapture(); - void Build( - const std::vector& feed_names, - std::vector* op_func_nodes) override; + void Build(const std::vector& feed_names, + std::vector* op_func_nodes, + bool switch_stream = false) override; bool IsStaticBuild() const override { return static_build_; } @@ -200,8 +204,8 @@ class PirInterpreter : public InterpreterBaseImpl { int64_t onednn_op_num_{-1}; std::vector trace_execute_order_; - std::vector output_hookfuncs_; - std::vector input_hookfuncs_; + std::vector pir_output_hookfuncs_; + std::vector pir_input_hookfuncs_; /// ======================== /// /// For new ir /// diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 8991fd9c3a22d..0bca82f5016e1 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -29,7 +29,7 @@ #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" #ifdef PADDLE_WITH_DNNL -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/onednn_helper.h" #endif #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/phi/backends/device_manager.h" @@ -150,7 +150,7 @@ FetchList ProgramInterpreter::Run(const std::vector& feed_names, is_in_op_profiling_mode_ = enable_op_profiling; std::vector op_func_nodes; - Build(feed_names, &op_func_nodes); + Build(feed_names, &op_func_nodes, switch_stream); if (!is_build_) { SetFeedVarsInplaceSkip(feed_names); @@ -166,7 +166,7 @@ FetchList ProgramInterpreter::Run(const std::vector& feed_names, } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (switch_stream) { - BuildOpFuncNode(&op_func_nodes); + Convert(&op_func_nodes); } #endif RunImpl(); @@ -208,7 +208,8 @@ FetchList ProgramInterpreter::Run(const std::vector& feed_names, void ProgramInterpreter::Build( const std::vector& feed_names, - std::vector* op_func_nodes) { + std::vector* op_func_nodes, + bool switch_stream) { SetDeviceId(place_); CheckCUDAGraphBeforeRun(feed_names); @@ -216,7 +217,7 @@ void ProgramInterpreter::Build( platform::AttachPointerHashToMKLDNNKey(this, place_); #endif - if (!is_build_) { + if (!is_build_ || switch_stream) { LOG_FIRST_N(INFO, 1) << "New Executor is Running."; paddle::framework::interpreter::BuildVariableScope( block_, execution_config_, &var_scope_); @@ -678,7 +679,42 @@ std::tuple ProgramInterpreter::InterpreterRunTime() { void ProgramInterpreter::Convert( std::vector* op_func_nodes) { auto& vec_meta_info = var_scope_.MutableVecMetaInfo(); - BuildOpFuncNode(op_func_nodes); + auto nodes = *op_func_nodes; + auto op_nums = nodes.size(); + vec_instruction_.clear(); + vec_instruction_.reserve(op_nums); + for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) { + auto& op_func_node = nodes[op_idx]; + stream_analyzer_.SetForceEventsToWaitInfo(force_events_to_wait_); + auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if 
(FLAGS_new_executor_use_cuda_graph) { + auto& op = op_func_node.operator_base_; + auto& op_type = op->Type(); + if (op_type == interpreter::kMemcpyD2H || + op_type == interpreter::kMemcpyH2D) { + PADDLE_THROW(paddle::platform::errors::Fatal( + "Cuda memory copy d2h/h2d is not allowed while using cuda graph.")); + } + PADDLE_ENFORCE_EQ(typeid(*dev_ctx_) == typeid(phi::GPUContext), + true, + platform::errors::InvalidArgument( + "Device context of op %s must be [%s] while using " + "cuda graph, but got [%s].", + op_type, + typeid(phi::GPUContext).name(), + typeid(*dev_ctx_).name())); + // cuda graph needs to record all stream + phi::backends::gpu::CUDAGraphContextManager::Instance() + .RecordCapturingDeviceContext(dev_ctx_); + } +#endif + vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + vec_instruction_.back().UpdateRecordStreamForGcInfo(); +#endif + } BuildOperatorDependences(); @@ -715,7 +751,6 @@ void ProgramInterpreter::Convert( } // calculate last_live_ops_ - auto op_nums = (*op_func_nodes).size(); for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) { Instruction& instr = vec_instruction_[op_idx]; OpInOutInfo info; @@ -852,46 +887,6 @@ void ProgramInterpreter::Convert( AnalyseExecuteOrderForTrace(); } -void ProgramInterpreter::BuildOpFuncNode( - std::vector* op_func_nodes) { - auto nodes = *op_func_nodes; - auto op_nums = nodes.size(); - vec_instruction_.clear(); - vec_instruction_.reserve(op_nums); - for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) { - auto& op_func_node = nodes[op_idx]; - stream_analyzer_.SetForceEventsToWaitInfo(force_events_to_wait_); - auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (FLAGS_new_executor_use_cuda_graph) { - auto& op = op_func_node.operator_base_; - auto& op_type = op->Type(); - if (op_type == interpreter::kMemcpyD2H || - op_type == interpreter::kMemcpyH2D) { - PADDLE_THROW(paddle::platform::errors::Fatal( - "Cuda memory copy d2h/h2d is not allowed while using cuda graph.")); - } - PADDLE_ENFORCE_EQ(typeid(*dev_ctx_) == typeid(phi::GPUContext), - true, - platform::errors::InvalidArgument( - "Device context of op %s must be [%s] while using " - "cuda graph, but got [%s].", - op_type, - typeid(phi::GPUContext).name(), - typeid(*dev_ctx_).name())); - // cuda graph needs to record all stream - phi::backends::gpu::CUDAGraphContextManager::Instance() - .RecordCapturingDeviceContext(dev_ctx_); - } -#endif - vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - vec_instruction_.back().UpdateRecordStreamForGcInfo(); -#endif - } -} - void ProgramInterpreter::BuildSkipShareLoDInfo() { for (size_t i = 0; i < vec_instruction_.size(); ++i) { bool can_skip_lod = true; diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index 7e956249e22a3..f72faf54f2b1d 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -60,9 +60,9 @@ class ProgramInterpreter : public InterpreterBaseImpl { std::shared_ptr GetMutableCopyProgram() override; - void Build( - const std::vector& feed_names, - std::vector* op_func_nodes) override; + void Build(const std::vector& feed_names, + std::vector* op_func_nodes, + bool switch_stream = false) override; void 
ShareWorkQueueFrom(InterpreterBaseImpl* src) override; @@ -101,6 +101,10 @@ class ProgramInterpreter : public InterpreterBaseImpl { input_hookfuncs_ = hookfuncs; } + void SetOutputHooks(const std::vector& hookfuncs) override {} + + void SetInputHooks(const std::vector& hookfuncs) override {} + std::unordered_map>* GetForceEventsToWaitInfo() { return force_events_to_wait_; @@ -127,8 +131,6 @@ class ProgramInterpreter : public InterpreterBaseImpl { void BuildSkipShareLoDInfo(); void UpdateSyncOpNum(); void AnalyseExecuteOrderForTrace(); - void BuildOpFuncNode( - std::vector* op_func_nodes); // inplace void BuildInplace(); diff --git a/paddle/fluid/framework/new_executor/workqueue/event_count.h b/paddle/fluid/framework/new_executor/workqueue/event_count.h index 9f80b02904dad..6918cc5a42edd 100644 --- a/paddle/fluid/framework/new_executor/workqueue/event_count.h +++ b/paddle/fluid/framework/new_executor/workqueue/event_count.h @@ -121,7 +121,7 @@ class EventCount { CheckState(state, true); uint64_t newstate; if ((state & kSignalMask) != 0) { - // Consume the signal and return immidiately. + // Consume the signal and return immediately. newstate = state - kWaiterInc - kSignalInc; } else { // Remove this thread from pre-wait counter and add to the waiter stack. @@ -148,7 +148,7 @@ class EventCount { CheckState(state, true); uint64_t newstate = state - kWaiterInc; // We don't know if the thread was also notified or not, - // so we should not consume a signal unconditionaly. + // so we should not consume a signal unconditionally. // Only if number of waiters is equal to number of signals, // we know that the thread was notified and we must take away the signal. if (((state & kWaiterMask) >> kWaiterShift) == diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h index ce0a138eb1a6a..4839592aa43b7 100644 --- a/paddle/fluid/framework/op_kernel_type.h +++ b/paddle/fluid/framework/op_kernel_type.h @@ -107,7 +107,7 @@ inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) { bool ret = (l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r); #ifdef PADDLE_WITH_DNNL - // Layout transform needed for either non-MKLDNN to MKLDNN or vice versa + // Layout transform needed for either non-MKLDNN to OneDNN or vice versa ret |= (l != DataLayout::ONEDNN && r == DataLayout::ONEDNN); ret |= (l == DataLayout::ONEDNN && r != DataLayout::ONEDNN); #endif diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index fe10a16375f34..d5dab65d18d15 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -53,8 +53,8 @@ class DenseTensor; #endif #ifdef PADDLE_WITH_DNNL -#include "paddle/fluid/platform/mkldnn_helper.h" -#include "paddle/fluid/platform/mkldnn_op_list.h" +#include "paddle/fluid/platform/onednn_helper.h" +#include "paddle/fluid/platform/onednn_op_list.h" #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -504,7 +504,7 @@ void RuntimeInferShapeContext::ShareLoD(const std::string& in, // Workaround: // Skip set_layout() when input layout is kMKLDNN // This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN - // OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called + // OPKernel. 
In all OneDNN OPkernel, set_layout(kMKLDNN) should be called // in Compute() if (in_tensor.layout() != DataLayout::ONEDNN) #endif @@ -1571,12 +1571,12 @@ bool OperatorWithKernel::SupportsKernelType( } #endif -// NOTE(jiahongyu): If MKLDNN can be used, the function SupportsKernelType needs -// to check whether current op supports MKLDNN kernel. There are three +// NOTE(jiahongyu): If OneDNN can be used, the function SupportsKernelType needs +// to check whether current op supports OneDNN kernel. There are three // statements in if condition: -// 1. Whether mkldnn kernel fallbacks to plain kernel; +// 1. Whether onednn kernel fallbacks to plain kernel; // 2. Whether this op has specific implementation; -// 3. Whether mkldnn kernel can be used. +// 3. Whether onednn kernel can be used. #ifdef PADDLE_WITH_DNNL if (!this->DnnFallback() && !paddle::platform::in_mkldnn_white_list(type_) && this->CanMKLDNNBeUsed(exe_ctx, kernel_type.data_type_)) { @@ -1771,7 +1771,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, // TODO(chenweihang): Now we are still reusing a lot of the original fluid // implementation, this is a gradual replacement process // TODO(chenweihang): in the first phase of project, we only support CPU, CUDA - // and RCOM backend, the XPU, NPU and MKLDNN will be supported in the second + // and RCOM backend, the XPU, NPU and OneDNN will be supported in the second // phase phi::KernelKey phi_kernel_key; std::string phi_kernel_name; @@ -1846,13 +1846,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } } else { phi_kernel_name = kernel_signature_->name; -// NOTE(jiahongyu): The registered MKLDNN kernel have library_type = +// NOTE(jiahongyu): The registered OneDNN kernel have library_type = // LibraryType::kMKLDNN and data_layout_ = DataLayout::ONEDNN. But the default // values are kPlain, so we need to modify the library_type and data_layout_ // here. There are three statements in if condition: -// 1. Whether mkldnn kernel fallbacks to plain kernel; +// 1. Whether onednn kernel fallbacks to plain kernel; // 2. Whether this op has specific implementation; -// 3. Whether mkldnn kernel can be used. +// 3. Whether onednn kernel can be used. #ifdef PADDLE_WITH_DNNL if (!this->DnnFallback() && !paddle::platform::in_mkldnn_white_list(type_) && @@ -2121,7 +2121,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } if (FLAGS_enable_unused_var_check) { - // skip op that uses mkldnn because it has different memory reuse strategy. + // skip op that uses onednn because it has different memory reuse strategy. // use attr here because some GradMakers (like ActivationGradOpMaker) add // input when use_mkldnn=true; if (!(HasAttr("use_mkldnn") && Attr("use_mkldnn"))) { @@ -2181,12 +2181,12 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( framework::TransPhiKernelKeyToOpKernelType(phi_kernel_key); // NOTE(jiahongyu): PADDLE_WITH_DNNL codes are moved outside function -// GetExpectedKernelType, so that if MKLDNN can be used, the library_type_ and +// GetExpectedKernelType, so that if OneDNN can be used, the library_type_ and // data_layout_ of expected_kernel_key need to be adjusted. There are three // statements in if condition: -// 1. Whether mkldnn kernel fallbacks to plain kernel; +// 1. Whether onednn kernel fallbacks to plain kernel; // 2. Whether this op has specific implementation; -// 3. Whether mkldnn kernel can be used. +// 3. Whether onednn kernel can be used. 
#ifdef PADDLE_WITH_DNNL if (!this->DnnFallback() && !paddle::platform::in_mkldnn_white_list(type_) && this->CanMKLDNNBeUsed(ctx, expected_kernel_key.data_type_)) { @@ -2815,7 +2815,7 @@ Scope* OperatorWithKernel::PrepareData( prepare_input_data(input_name, &ins_vector, &in_def, should_skip_input); } #ifdef PADDLE_WITH_DNNL - // For input that is Extra, only MKLDNN will use Extra Inputs + // For input that is Extra, only OneDNN will use Extra Inputs auto& extra_input_names = paddle::operators::ExtraInfoUtils::Instance().GetExtraInputNamesMap( Type()); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 3ad9ec6c9d698..dc025998cc099 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -921,7 +921,7 @@ class OperatorWithKernel : public OperatorBase { mutable std::mutex cache_update_mutex_; mutable bool enable_cache_transfer_scope_ = false; // NOTE(jiahongyu): Whether fallback to plain kernel after calling - // GetExpectedKernelType, use this bool flag to solve mkldnn and cudnn hard + // GetExpectedKernelType, use this bool flag to solve onednn and cudnn hard // code mutable bool dnn_fallback_ = false; // NOTE(chenweihang): Similar op members are used to adapt to diff --git a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt index 4a0a869b8a2bd..2e4e5083caa36 100644 --- a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt +++ b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt @@ -17,8 +17,8 @@ set(paddle2cinn_deps auto_schedule_proto parallel_executor common) -if(WITH_MKLDNN) - set(paddle2cinn ${paddle2cinn} mkldnn) +if(WITH_ONEDNN) + set(paddle2cinn ${paddle2cinn} onednn) endif() if(WITH_TESTING) diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc index 90567bd728cd5..ce4304b4ec228 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc @@ -36,7 +36,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" #include "paddle/fluid/framework/paddle2cinn/cinn_subgraph_detector.h" -#include "paddle/fluid/operators/cinn/cinn_launch_op.h" +#include "paddle/fluid/operators/cinn/cinn_op_helper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 31ab7e1b1bcaa..4b5051a8aadd0 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -79,7 +79,8 @@ cc_library( layout_autotune ops_extra_info phi - common) + common + global_utils) cc_library( basic_engine SRCS basic_engine.cc diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 5192e8c773888..a3c5b51b80b3b 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -27,7 +27,7 @@ #include "paddle/fluid/platform/profiler.h" #include "paddle/phi/kernels/funcs/math_function.h" #ifdef PADDLE_WITH_DNNL -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/onednn_helper.h" #endif COMMON_DECLARE_bool(use_mkldnn); diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index a60c81a4c22d9..9f4f46c60cea4 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -26,7 +26,7 @@ #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" #endif #ifdef PADDLE_WITH_DNNL -#include "paddle/fluid/platform/mkldnn_op_list.h" +#include "paddle/fluid/platform/onednn_op_list.h" #endif #include "paddle/common/flags.h" #include "paddle/fluid/framework/library_type.h" @@ -166,7 +166,7 @@ PreparedOp PrepareImpl( auto* dev_ctx = pool.Get(place); #ifdef PADDLE_WITH_DNNL - // MKLDNN variant of code reads attributes in some of GetKernelTypeForVar and + // OneDNN variant of code reads attributes in some of GetKernelTypeForVar and // GetKernelType functions, so we need to copy the attributes there. // Const qualifier of Attrs had to be discarded to overwrite it. if (FLAGS_use_mkldnn) { @@ -190,13 +190,13 @@ PreparedOp PrepareImpl( phi::KernelSignature kernel_signature; std::string phi_kernel_name; -// NOTE(jiahongyu): The registered MKLDNN kernel have library_type = +// NOTE(jiahongyu): The registered OneDNN kernel have library_type = // LibraryType::kMKLDNN and data_layout_ = DataLayout::ONEDNN. But the default // values are kPlain, so we need to modify the library_type and data_layout_ // here. There are three statements in if condition: -// 1. Whether mkldnn kernel fallbacks to plain kernel; +// 1. Whether onednn kernel fallbacks to plain kernel; // 2. Whether this op has specific implementation; -// 3. Whether mkldnn kernel can be used. +// 3. Whether onednn kernel can be used. 
#ifdef PADDLE_WITH_DNNL if (!op.DnnFallback() && !paddle::platform::in_mkldnn_white_list(op.Type()) && op.CanMKLDNNBeUsed(dygraph_exe_ctx, expected_kernel_key.dtype())) { diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 3eff589fee703..7aa4652ec0058 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -36,8 +36,8 @@ #include "paddle/utils/string/string_helper.h" COMMON_DECLARE_bool(use_mkldnn); -COMMON_DECLARE_string(tracer_mkldnn_ops_on); -COMMON_DECLARE_string(tracer_mkldnn_ops_off); +COMMON_DECLARE_string(tracer_onednn_ops_on); +COMMON_DECLARE_string(tracer_onednn_ops_off); COMMON_DECLARE_bool(use_stride_kernel); namespace paddle { @@ -245,12 +245,12 @@ void Tracer::TraceOpImpl(const std::string& type, // if both lists are empty all ops are enabled (default for // FLAGS_use_mkldnn=1) // if ops_on list is not empty only ops from that list are enabled - if (!FLAGS_tracer_mkldnn_ops_on.empty()) { - auto is_on = FLAGS_tracer_mkldnn_ops_on.find(type) != std::string::npos; + if (!FLAGS_tracer_onednn_ops_on.empty()) { + auto is_on = FLAGS_tracer_onednn_ops_on.find(type) != std::string::npos; attrs["use_mkldnn"] = is_on; } else { // if ops_on list is empty all ops are enabled except types from off_list - auto is_off = FLAGS_tracer_mkldnn_ops_off.find(type) != std::string::npos; + auto is_off = FLAGS_tracer_onednn_ops_off.find(type) != std::string::npos; attrs["use_mkldnn"] = !is_off; } } diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 77052155efaa6..1f3544bf702b4 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -132,7 +132,7 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("graph_viz_path", new std::string(std::move(dot_file_path))); pass->Set("optim_cache_dir", new std::string(std::move(optim_cache_dir))); pass_num++; - } else if (pass_name == "mkldnn_placement_pass") { + } else if (pass_name == "onednn_placement_pass") { pass->Set("mkldnn_enabled_op_types", new std::unordered_set( argument->mkldnn_enabled_op_types())); @@ -364,13 +364,13 @@ void IRPassManager::CreatePasses(Argument *argument, argument->nnadapter_model_cache_token())); } else if (pass_name == "fc_fuse_pass") { pass->Set("use_gpu", new bool(argument->use_gpu())); - bool fc_mkldnn_pass = false; + bool fc_onednn_pass = false; for (const std::string &pass_n : passes) { - if (pass_n == "fc_mkldnn_pass") { - fc_mkldnn_pass = true; + if (pass_n == "fc_onednn_pass") { + fc_onednn_pass = true; } } - bool use_fc_padding = !fc_mkldnn_pass && argument->use_fc_padding(); + bool use_fc_padding = !fc_onednn_pass && argument->use_fc_padding(); pass->Set("use_fc_padding", new bool(use_fc_padding)); } else if (pass_name == "fused_multi_transformer_xpu_pass") { int quant_post_dynamic_weight_precision = diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 65a4bea5b1240..c559b6d7e8897 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -4,12 +4,12 @@ endif() add_subdirectory(details) -if(WITH_MKLDNN) +if(WITH_ONEDNN) set(mkldnn_quantizer_cfg mkldnn_quantizer_config) - set(mkldnn_quantizer_src ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn_quantizer.cc) + set(mkldnn_quantizer_src ${CMAKE_CURRENT_SOURCE_DIR}/onednn_quantizer.cc) cc_library( ${mkldnn_quantizer_cfg} - SRCS mkldnn_quantizer_config.cc + SRCS 
onednn_quantizer_config.cc DEPS lod_tensor paddle_pass_builder) set(mkldnn_quantizer_cfg ${mkldnn_quantizer_cfg} diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index efe7b83f7df16..b8570fa05e7c4 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -505,7 +505,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(dlnne_precision_mode_); CP_MEMBER(dlnne_disable_nodes_by_outputs_); CP_MEMBER(dlnne_input_shape_dict_); - // MKLDNN related. + // OneDNN related. CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); CP_MEMBER(mkldnn_cache_capacity_); @@ -991,18 +991,18 @@ void AnalysisConfig::Update() { #ifdef PADDLE_WITH_DNNL // Since EnableMKLDNN is default, the pass_builder has created in the first // time. - // Case1: User manually disable mkldnn after pass_builder + // Case1: User manually disable onednn after pass_builder // create.(config.disable_mkldnn()) // Case2: User device is gpu/ipu/xpu, use // EnableXpu(), EnableCUDNN(), PassStrategy has been reset in the above code // block // Case3: pass_builder_ has been created and belongs to - // GpuPassStrategy(or IpuPassStrategy), neither enable mkldnn and - // disable mkldnn will be executed + // GpuPassStrategy(or IpuPassStrategy), neither enable onednn and + // disable onednn will be executed if ((!use_gpu() && !use_xpu() && !use_ipu() && !use_mkldnn_) || (use_mkldnn_ && !phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx2))) { - // User manually disable mkldnn or disable when not support AVX2 + // User manually disable onednn or disable when not support AVX2 use_mkldnn_ = false; pass_builder()->DisableMKLDNN(); } @@ -1054,7 +1054,7 @@ void AnalysisConfig::Update() { if (!use_gpu() && !use_xpu() && !use_ipu()) { if (use_mkldnn_ && enable_ir_optim_) { #ifdef PADDLE_WITH_DNNL - // default enable mkldnn when device is cpu and enable_ir_optim + // default enable onednn when device is cpu and enable_ir_optim pass_builder()->EnableMKLDNN(); #endif } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 9420d84bab558..d4a73175b3222 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -31,6 +31,7 @@ #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" @@ -79,7 +80,7 @@ #endif #ifdef PADDLE_WITH_DNNL -#include "paddle/fluid/inference/api/mkldnn_quantizer.h" +#include "paddle/fluid/inference/api/onednn_quantizer.h" #endif #ifdef PADDLE_WITH_ONNXRUNTIME @@ -107,6 +108,7 @@ #ifdef PADDLE_WITH_CINN #include "paddle/cinn/hlir/dialect/operator/ir/op_dialect.h" #include "paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.h" +#include "paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.h" #include "paddle/pir/include/dialect/shape/ir/shape_dialect.h" #endif @@ -407,7 +409,7 @@ bool AnalysisPredictor::Init( root_predictor_id_ = predictor_id_; } - // no matter with or without MKLDNN + // no matter with or without OneDNN paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); // Use Optimized model to inference @@ 
-618,6 +620,9 @@ void AnalysisPredictor::ClearExtraParams() { config_.shape_range_info_path_); } } + if (op_desc->HasAttr("predictor_id")) { + op_desc->SetAttr("predictor_id", predictor_id_); + } } } @@ -780,10 +785,18 @@ bool AnalysisPredictor::PrepareProgram( executor_->CreateVariables(*inference_program_, 0, true, sub_scope_); // if enable_ir_optim_ is false, - // the analysis pass(op fuse, graph analysis, trt subgraph, mkldnn etc) will + // the analysis pass(op fuse, graph analysis, trt subgraph, onednn etc) will // not be executed. model_precision_ = paddle::inference::GetModelPrecision(*inference_program_); +#ifdef PADDLE_WITH_TENSORRT + if (config_.tensorrt_engine_enabled()) { + inference::tensorrt::TensorRTEngine::predictor_id_per_thread = + predictor_id_; + VLOG(3) << "thread_local var predictor_id in TensorRTEngine is set to: " + << inference::tensorrt::TensorRTEngine::predictor_id_per_thread; + } +#endif if (config_.use_optimized_model_) { LoadParameters(); ClearExtraParams(); @@ -896,36 +909,35 @@ bool AnalysisPredictor::PrepareExecutor() { }; #ifdef PADDLE_WITH_CINN + auto CreatePassMgr = [&] { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + ctx->GetOrRegisterDialect(); + auto pass_manager = std::make_shared<::pir::PassManager>( + ::pir::IrContext::Instance(), 2); + if (!config_.glog_info_disabled()) { + pass_manager->EnablePrintStatistics(); + } + if (config_.ir_debug_) { + pass_manager->EnableIRPrinting( + std::make_unique( + ir_printing_conditions, ir_printing_conditions)); + } + return pass_manager; + }; + if (paddle::prim::PrimCommonUtils::IsFwdPrimEnabled()) { VLOG(4) << "[Prim] Decomp program in predictor begin."; DecompProgram decomp_object(pir_program_.get()); decomp_object.decomp_program(); - auto shape_pm = std::make_shared<::pir::PassManager>( - ::pir::IrContext::Instance(), 2); - ::pir::shape::AddShapeOptimizationPass(shape_pm, *pir_program_.get()); - VLOG(4) << "[ShapeDialect] Run AddShapeOptimizationPass"; - shape_pm->Run(pir_program_.get()); + cinn::dialect::ir::CheckInferSymbolicIfNeed(pir_program_.get(), + CreatePassMgr); } if (config_.cinn_enabled()) { VLOG(4) << "[CINN] Begin ApplyCinnPass"; - cinn::dialect::ir::ApplyCinnPass(pir_program_.get(), [&] { - pir::IrContext *ctx = pir::IrContext::Instance(); - ctx->GetOrRegisterDialect(); - ctx->GetOrRegisterDialect(); - auto pass_manager = std::make_shared<::pir::PassManager>( - ::pir::IrContext::Instance(), 2); - if (!config_.glog_info_disabled()) { - pass_manager->EnablePrintStatistics(); - } - if (config_.ir_debug_) { - pass_manager->EnableIRPrinting( - std::make_unique( - ir_printing_conditions, ir_printing_conditions)); - } - return pass_manager; - }); + cinn::dialect::ir::ApplyCinnPass(pir_program_.get(), CreatePassMgr); } #endif @@ -934,16 +946,14 @@ bool AnalysisPredictor::PrepareExecutor() { config_.pm_opt_level_); if (!config_.custom_passes_.empty()) { for (const auto &custom_pass : config_.custom_passes_) { - pass_pm.AddPass( - std::move(pir::PassRegistry::Instance().Get(custom_pass))); + pass_pm.AddPass(pir::PassRegistry::Instance().Get(custom_pass)); } } if (config_.use_gpu()) { // gpu if (!config_.custom_pass_only_) { for (const auto &gpu_pass : kPirGpuPasses) { - pass_pm.AddPass( - std::move(pir::PassRegistry::Instance().Get(gpu_pass))); + pass_pm.AddPass(pir::PassRegistry::Instance().Get(gpu_pass)); } } @@ -963,8 +973,7 @@ bool AnalysisPredictor::PrepareExecutor() { // mkldnn if (!config_.custom_pass_only_) { for (const auto &mkldnn_pass : kPirMkldnnPasses) { 
- pass_pm.AddPass( - std::move(pir::PassRegistry::Instance().Get(mkldnn_pass))); + pass_pm.AddPass(pir::PassRegistry::Instance().Get(mkldnn_pass)); } } #endif @@ -972,8 +981,7 @@ bool AnalysisPredictor::PrepareExecutor() { // cpu if (!config_.custom_pass_only_) { for (const auto &cpu_pass : kPirCpuPasses) { - pass_pm.AddPass( - std::move(pir::PassRegistry::Instance().Get(cpu_pass))); + pass_pm.AddPass(pir::PassRegistry::Instance().Get(cpu_pass)); } } } @@ -2014,14 +2022,6 @@ void AnalysisPredictor::PrepareArgument() { // NOTE All the members in AnalysisConfig should be copied to Argument. void AnalysisPredictor::OptimizeInferenceProgram() { PrepareArgument(); -#ifdef PADDLE_WITH_TENSORRT - if (config_.tensorrt_engine_enabled()) { - inference::tensorrt::TensorRTEngine::predictor_id_per_thread = - predictor_id_; - VLOG(3) << "thread_local var predictor_id in TensorRTEngine is set to: " - << inference::tensorrt::TensorRTEngine::predictor_id_per_thread; - } -#endif Analyzer().Run(argument_.get()); PADDLE_ENFORCE_EQ( argument_->scope_valid(), @@ -3108,49 +3108,99 @@ void AnalysisPredictor::SaveOptimModel(const std::string &dir) { exe.Run(save_program, scope(), 0, true, true); } -void AnalysisPredictor::RegisterInputHook(const InputTensorHookFunc &hookfunc) { - std::call_once(register_input_hook_flag_, [this] { - executor_->RegisterInputHook( - [this](framework::OperatorBase *op, framework::Scope *scope) { - for (auto &input : op->Inputs()) { - for (auto &var_name : input.second) { +void AnalysisPredictor::RegisterOutputHook( + const OutputTensorHookFunc &hookfunc) { + if (config_.new_ir_enabled()) { + std::call_once(register_output_hook_flag_, [this] { + executor_->RegisterOutputHook( + [this](framework::InstructionBase *instr, + framework::ValueExecutionInfo *value_exe_info, + framework::Scope *scope) { + for (auto &output : instr->Outputs()) { + auto var_name = value_exe_info->GetVarName(output.first); auto *var = scope->FindVar(var_name); if (!var || !var->IsType()) continue; auto dense_tensor = var->Get(); if (!dense_tensor.initialized()) continue; auto tensor = paddle::Tensor( std::make_shared(dense_tensor), var_name); - for (auto &hookfunc : this->input_hookfuncs_) { - hookfunc(op->Type(), var_name, tensor); + for (auto &hookfunc : this->output_hookfuncs_) { + hookfunc(instr->Name() + ":" + std::to_string(instr->Id()), + var_name, + tensor); } } - } - }); - }); - input_hookfuncs_.push_back(hookfunc); + }); + }); + output_hookfuncs_.push_back(hookfunc); + } else { + std::call_once(register_output_hook_flag_, [this] { + executor_->RegisterOutputHook( + [this](framework::OperatorBase *op, framework::Scope *scope) { + for (auto &output : op->Outputs()) { + for (auto &var_name : output.second) { + auto *var = scope->FindVar(var_name); + if (!var || !var->IsType()) continue; + auto dense_tensor = var->Get(); + if (!dense_tensor.initialized()) continue; + auto tensor = paddle::Tensor( + std::make_shared(dense_tensor), var_name); + for (auto &hookfunc : this->output_hookfuncs_) { + hookfunc(op->Type(), var_name, tensor); + } + } + } + }); + }); + output_hookfuncs_.push_back(hookfunc); + } } -void AnalysisPredictor::RegisterOutputHook( - const OutputTensorHookFunc &hookfunc) { - std::call_once(register_output_hook_flag_, [this] { - executor_->RegisterOutputHook( - [this](framework::OperatorBase *op, framework::Scope *scope) { - for (auto &output : op->Outputs()) { - for (auto &var_name : output.second) { +void AnalysisPredictor::RegisterInputHook(const InputTensorHookFunc &hookfunc) { + if 
(config_.new_ir_enabled()) { + std::call_once(register_input_hook_flag_, [this] { + executor_->RegisterInputHook( + [this](framework::InstructionBase *instr, + framework::ValueExecutionInfo *value_exe_info, + framework::Scope *scope) { + for (auto &input : instr->Inputs()) { + auto var_name = value_exe_info->GetVarName(input.first); auto *var = scope->FindVar(var_name); if (!var || !var->IsType()) continue; auto dense_tensor = var->Get(); if (!dense_tensor.initialized()) continue; auto tensor = paddle::Tensor( std::make_shared(dense_tensor), var_name); - for (auto &hookfunc : this->output_hookfuncs_) { - hookfunc(op->Type(), var_name, tensor); + for (auto &hookfunc : this->input_hookfuncs_) { + hookfunc(instr->Name() + ":" + std::to_string(instr->Id()), + var_name, + tensor); } } - } - }); - }); - output_hookfuncs_.push_back(hookfunc); + }); + }); + input_hookfuncs_.push_back(hookfunc); + } else { + std::call_once(register_input_hook_flag_, [this] { + executor_->RegisterInputHook( + [this](framework::OperatorBase *op, framework::Scope *scope) { + for (auto &input : op->Inputs()) { + for (auto &var_name : input.second) { + auto *var = scope->FindVar(var_name); + if (!var || !var->IsType()) continue; + auto dense_tensor = var->Get(); + if (!dense_tensor.initialized()) continue; + auto tensor = paddle::Tensor( + std::make_shared(dense_tensor), var_name); + for (auto &hookfunc : this->input_hookfuncs_) { + hookfunc(op->Type(), var_name, tensor); + } + } + } + }); + }); + input_hookfuncs_.push_back(hookfunc); + } } template <> @@ -3451,7 +3501,7 @@ uint64_t Predictor::TryShrinkMemory() { return predictor_->TryShrinkMemory(); } void Predictor::RegisterOutputHook(const OutputTensorHookFunc &hookfunc) { predictor_->RegisterOutputHook(hookfunc); } -void Predictor::RegisterInputHook(const OutputTensorHookFunc &hookfunc) { +void Predictor::RegisterInputHook(const InputTensorHookFunc &hookfunc) { predictor_->RegisterInputHook(hookfunc); } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index fe494cab93a90..d44ad5cec1a90 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -321,7 +321,7 @@ class AnalysisPredictor : public PaddlePredictor { void RegisterInputHook(const InputTensorHookFunc &hookfunc) override; /// - /// \brief Initialize mkldnn quantizer and execute mkldnn quantization pass + /// \brief Initialize onednn quantizer and execute onednn quantization pass /// /// \return Whether the function executed successfully /// diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 1ae582feb4acf..9ae284402f196 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -71,7 +71,7 @@ bool NativePaddlePredictor::Init( platform::EnableProfiler(tracking_device); } - // no matter with or without MKLDNN + // no matter with or without OneDNN paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); if (config_.use_gpu) { diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 833fc98d36dba..5597057c3dc12 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -155,13 +155,13 @@ if(WITH_MKL) ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) endif() - set(MKLDNN_PATH 
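The hook registration above now branches between the PIR instruction path and the legacy operator path, but both invoke user callbacks as (operator identifier, variable name, tensor). A hedged sketch of registering such a hook through the public `Predictor` API; the exact `OutputTensorHookFunc` signature is inferred from the invocations above, not confirmed by this diff:

```cpp
#include <iostream>
#include <string>

#include "paddle_inference_api.h"  // paddle_infer::Predictor

void AttachDebugHooks(paddle_infer::Predictor* predictor) {
  predictor->RegisterOutputHook(
      [](const std::string& op_info,      // PIR: "op_name:op_id"; legacy: op type
         const std::string& var_name,     // name of the output variable
         const paddle::Tensor& tensor) {  // view of the output tensor
        std::cout << op_info << " -> " << var_name << std::endl;
        (void)tensor;
      });
}
```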
"${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn") - if(EXISTS ${MKLDNN_PATH}) - include_directories("${MKLDNN_PATH}/include") + set(ONEDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}onednn") + if(EXISTS ${ONEDNN_PATH}) + include_directories("${ONEDNN_PATH}/include") if(WIN32) - set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib) + set(MKLDNN_LIB ${ONEDNN_PATH}/lib/mkldnn.lib) else() - set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libdnnl.so.3) + set(MKLDNN_LIB ${ONEDNN_PATH}/lib/libdnnl.so.3) endif() endif() else() @@ -309,7 +309,7 @@ if(WIN32) ${LIB_PATH} COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/libiomp5md.dll ${LIB_PATH} - COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_PATH}/lib/mkldnn.dll + COMMAND ${CMAKE_COMMAND} -E copy ${ONEDNN_PATH}/lib/mkldnn.dll ${LIB_PATH}) else() add_custom_command( diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc index c3589f4251791..fda408b15df5f 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc @@ -57,9 +57,10 @@ std::unique_ptr CreateTensor(paddle_infer::PlaceType place, template struct RandomGenerator { - RandomGenerator(double min = (std::numeric_limits::min)(), - double max = (std::numeric_limits::max)()) - : dist_{static_cast(min), static_cast(max)} {} + RandomGenerator( + double min = static_cast((std::numeric_limits::min)()), + double max = static_cast((std::numeric_limits::max)())) + : dist_{min, max} {} T operator()() { return static_cast(dist_(random_engine_)); } private: diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/onednn_quantizer.cc similarity index 99% rename from paddle/fluid/inference/api/mkldnn_quantizer.cc rename to paddle/fluid/inference/api/onednn_quantizer.cc index 76222b84d4624..aa6f52008ab24 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/onednn_quantizer.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/inference/api/mkldnn_quantizer.h" +#include "paddle/fluid/inference/api/onednn_quantizer.h" #include #include @@ -29,7 +29,7 @@ #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/api/analysis_predictor.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/onednn_helper.h" #include "paddle/phi/common/place.h" #include "paddle/utils/string/pretty_log.h" diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.h b/paddle/fluid/inference/api/onednn_quantizer.h similarity index 100% rename from paddle/fluid/inference/api/mkldnn_quantizer.h rename to paddle/fluid/inference/api/onednn_quantizer.h diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc b/paddle/fluid/inference/api/onednn_quantizer_config.cc similarity index 98% rename from paddle/fluid/inference/api/mkldnn_quantizer_config.cc rename to paddle/fluid/inference/api/onednn_quantizer_config.cc index da20870eb0f5c..786d9463766e9 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc +++ b/paddle/fluid/inference/api/onednn_quantizer_config.cc @@ -14,7 +14,7 @@ #include -#include "paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h" +#include "paddle/fluid/inference/api/paddle_onednn_quantizer_config.h" namespace paddle { diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 72df8efb095a6..019418f45b625 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -39,7 +39,7 @@ #include "paddle_api.h" // NOLINT #include "paddle_pass_builder.h" // NOLINT #ifdef PADDLE_WITH_DNNL -#include "paddle_mkldnn_quantizer_config.h" // NOLINT +#include "paddle_onednn_quantizer_config.h" // NOLINT #endif namespace paddle { @@ -970,19 +970,19 @@ struct PD_INFER_DECL AnalysisConfig { void SwitchIrDebug(int x = true, const std::vector& passes = {}); /// - /// \brief Turn on MKLDNN. + /// \brief Turn on OneDNN. /// /// void EnableMKLDNN(); /// - /// \brief Turn down MKLDNN. + /// \brief Turn down OneDNN. /// /// void DisableMKLDNN(); /// - /// \brief Set the cache capacity of different input shapes for MKLDNN. + /// \brief Set the cache capacity of different input shapes for OneDNN. /// Default value 0 means not caching any shape. /// Please see MKL-DNN Data Caching Design Document: /// https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/mkldnn/caching/caching.md @@ -991,9 +991,9 @@ struct PD_INFER_DECL AnalysisConfig { /// void SetMkldnnCacheCapacity(int capacity); /// - /// \brief A boolean state telling whether to use the MKLDNN. + /// \brief A boolean state telling whether to use the OneDNN. /// - /// \return bool Whether to use the MKLDNN. + /// \return bool Whether to use the OneDNN. /// bool mkldnn_enabled() const { return use_mkldnn_; } @@ -1021,7 +1021,7 @@ struct PD_INFER_DECL AnalysisConfig { /// NativeConfig ToNativeConfig() const; /// - /// \brief Specify the operator type list to use MKLDNN acceleration. + /// \brief Specify the operator type list to use OneDNN acceleration. /// /// \param op_list The operator type list. /// @@ -1030,47 +1030,47 @@ struct PD_INFER_DECL AnalysisConfig { } /// - /// \brief Turn on MKLDNN quantization. + /// \brief Turn on OneDNN quantization. /// /// void EnableMkldnnQuantizer(); /// - /// \brief Turn on MKLDNN int8. + /// \brief Turn on OneDNN int8. 
/// /// \param op_list The operator type list. /// void EnableMkldnnInt8(const std::unordered_set& op_list = {}); /// - /// \brief A boolean state telling whether to use the MKLDNN Int8. + /// \brief A boolean state telling whether to use the OneDNN Int8. /// - /// \return bool Whether to use the MKLDNN Int8. + /// \return bool Whether to use the OneDNN Int8. /// bool mkldnn_int8_enabled() const { return use_mkldnn_int8_; } /// - /// \brief Turn on MKLDNN bfloat16. + /// \brief Turn on OneDNN bfloat16. /// /// void EnableMkldnnBfloat16(); /// - /// \brief Turn off MKLDNN fc passes. + /// \brief Turn off OneDNN fc passes. /// void DisableMkldnnFcPasses(); /// - /// \brief A boolean state telling whether to disable the MKLDNN Fc passes. + /// \brief A boolean state telling whether to disable the OneDNN Fc passes. /// - /// \return bool Whether to disable the MKLDNN Fc passes. + /// \return bool Whether to disable the OneDNN Fc passes. /// bool mkldnn_fc_passes_disabled() const { return disable_mkldnn_fc_passes_; } /// - /// \brief A boolean state telling whether to use the MKLDNN Bfloat16. + /// \brief A boolean state telling whether to use the OneDNN Bfloat16. /// - /// \return bool Whether to use the MKLDNN Bfloat16. + /// \return bool Whether to use the OneDNN Bfloat16. /// bool mkldnn_bfloat16_enabled() const { return use_mkldnn_bfloat16_; } @@ -1091,16 +1091,16 @@ struct PD_INFER_DECL AnalysisConfig { bool thread_local_stream_enabled() const { return thread_local_stream_; } /// - /// \brief A boolean state telling whether the MKLDNN quantization is enabled. + /// \brief A boolean state telling whether the OneDNN quantization is enabled. /// - /// \return bool Whether the MKLDNN quantization is enabled. + /// \return bool Whether the OneDNN quantization is enabled. /// bool mkldnn_quantizer_enabled() const { return use_mkldnn_quantizer_; } /// - /// \brief Get MKLDNN quantizer config. + /// \brief Get OneDNN quantizer config. /// - /// \return MkldnnQuantizerConfig* MKLDNN quantizer config. + /// \return MkldnnQuantizerConfig* OneDNN quantizer config. /// MkldnnQuantizerConfig* mkldnn_quantizer_config() const; @@ -1250,8 +1250,16 @@ struct PD_INFER_DECL AnalysisConfig { bool custom_pass_only = false); /// - /// \brief Set passmanager opt level.Pass level lower than - /// opt level which will be added to passmanager + /// \brief Set the pir optimization level. + /// \param opt_level The optimization level, in range [0, 4]; default is 2. + /// A higher optimization level allows the predictor to apply more passes. + /// If 0, only basic passes are supported. + /// If 1, functional passes are additionally supported. + /// If 2, fusion passes are additionally supported; these may affect + /// precision and speed. + /// If 3, layout passes are supported, etc. + /// If 4, radical optimizations are added, which may affect precision, etc. /// void SetOptimizationLevel(int opt_level); @@ -1419,7 +1427,7 @@ struct PD_INFER_DECL AnalysisConfig { // NNAdapter related LiteNNAdapterConfig nnadapter_config_; - // mkldnn related. + // onednn related.
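A short usage sketch for the re-documented optimization level; the model paths are placeholders, and whether additional pir switches are needed depends on the build:

```cpp
#include "paddle/fluid/inference/api/paddle_analysis_config.h"

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("./model.pdmodel", "./model.pdiparams");
  // Level 2 (the default) enables fusion passes on top of the basic and
  // functional ones; 3 adds layout passes, 4 the most aggressive set.
  config.SetOptimizationLevel(2);
  return 0;
}
```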
int mkldnn_cache_capacity_{10}; bool use_mkldnn_quantizer_{false}; std::shared_ptr mkldnn_quantizer_config_; diff --git a/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h b/paddle/fluid/inference/api/paddle_onednn_quantizer_config.h similarity index 99% rename from paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h rename to paddle/fluid/inference/api/paddle_onednn_quantizer_config.h index 1208c29c79a9c..c44f7a3e0d049 100644 --- a/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h +++ b/paddle/fluid/inference/api/paddle_onednn_quantizer_config.h @@ -53,7 +53,7 @@ enum class ScaleAlgo { /// /// \class MkldnnQuantizerConfig /// -/// \brief Config for mkldnn quantize. +/// \brief Config for onednn quantize. /// /// The MkldnnQuantizerConfig is used to configure Mkldnn's quantization /// parameters, including scale algorithm, warmup data, warmup batch size, diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 06f3d9d899659..e503f1133cb7b 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -228,7 +228,6 @@ const std::vector kCINNCompilerPasses{ "gpu_cpu_map_matmul_v2_to_mul_pass", "gpu_cpu_map_matmul_v2_to_matmul_pass", "gpu_cpu_map_matmul_to_mul_pass", - "build_cinn_pass", }; const std::vector CpuBasicPasses{ @@ -358,34 +357,34 @@ void CpuPassStrategy::EnableMKLDNN() { // TODO(Superjomn) Consider the way to mix CPU with GPU. #ifdef PADDLE_WITH_DNNL if (!use_mkldnn_) { - passes_.insert(passes_.begin(), "mkldnn_placement_pass"); + passes_.insert(passes_.begin(), "onednn_placement_pass"); for (auto &pass : std::vector({ "squeeze2_transpose2_onednn_fuse_pass", - "depthwise_conv_mkldnn_pass", // + "depthwise_conv_onednn_pass", // "conv_bn_fuse_pass", // Execute BN passes again to "conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order - "conv_affine_channel_mkldnn_fuse_pass", // + "conv_affine_channel_onednn_fuse_pass", // "conv_transpose_bn_fuse_pass", // "conv_transpose_eltwiseadd_bn_fuse_pass", // - "conv_bias_mkldnn_fuse_pass", // - "conv_transpose_bias_mkldnn_fuse_pass", + "conv_bias_onednn_fuse_pass", // + "conv_transpose_bias_onednn_fuse_pass", // TODO(baoachun): Need to support 5-dimensional input. 
- // "conv3d_bias_mkldnn_fuse_pass", // - "conv_elementwise_add_mkldnn_fuse_pass", - "conv_activation_mkldnn_fuse_pass", // + // "conv3d_bias_onednn_fuse_pass", // + "conv_elementwise_add_onednn_fuse_pass", + "conv_activation_onednn_fuse_pass", // "scale_matmul_fuse_pass", // - "reshape_transpose_matmul_mkldnn_fuse_pass", // - "matmul_transpose_reshape_mkldnn_fuse_pass", // - "matmul_elementwise_add_mkldnn_fuse_pass", // - "matmul_activation_mkldnn_fuse_pass", // + "reshape_transpose_matmul_onednn_fuse_pass", // + "matmul_transpose_reshape_onednn_fuse_pass", // + "matmul_elementwise_add_onednn_fuse_pass", // + "matmul_activation_onednn_fuse_pass", // // Disabled due to topology-dependent speed-up - "fc_mkldnn_pass", - "fc_act_mkldnn_fuse_pass", + "fc_onednn_pass", + "fc_act_onednn_fuse_pass", "self_attention_fuse_pass", // "batch_norm_act_fuse_pass", // "softplus_activation_onednn_fuse_pass", // - "shuffle_channel_mkldnn_detect_pass", // + "shuffle_channel_onednn_detect_pass", // "elementwise_act_onednn_fuse_pass", // "operator_scale_onednn_fuse_pass", // "operator_unsqueeze2_onednn_fuse_pass", // @@ -419,8 +418,8 @@ void CpuPassStrategy::EnableMkldnnQuantizer() { void CpuPassStrategy::EnableMkldnnBfloat16() { #ifdef PADDLE_WITH_DNNL if (!use_mkldnn_bfloat16_) { - passes_.emplace_back("fc_mkldnn_pass"); - passes_.emplace_back("fc_act_mkldnn_fuse_pass"); + passes_.emplace_back("fc_onednn_pass"); + passes_.emplace_back("fc_act_onednn_fuse_pass"); passes_.emplace_back("cpu_bfloat16_placement_pass"); passes_.emplace_back("cpu_bfloat16_pass"); @@ -437,8 +436,8 @@ void CpuPassStrategy::EnableMkldnnInt8() { if (!use_mkldnn_int8_) { passes_.clear(); passes_.emplace_back("simplify_with_basic_ops_pass"); - passes_.emplace_back("quant_dequant_mkldnn_pass"); - passes_.emplace_back("mkldnn_placement_pass"); + passes_.emplace_back("quant_dequant_onednn_pass"); + passes_.emplace_back("onednn_placement_pass"); passes_.emplace_back("constant_folding_pass"); passes_.emplace_back("squeeze2_transpose2_onednn_fuse_pass"); passes_.emplace_back("layer_norm_fuse_pass"); @@ -462,27 +461,27 @@ void CpuPassStrategy::EnableMkldnnInt8() { passes_.emplace_back("matmul_scale_fuse_pass"); passes_.emplace_back("gpu_cpu_map_matmul_to_mul_pass"); passes_.emplace_back("repeated_fc_relu_fuse_pass"); - passes_.emplace_back("depthwise_conv_mkldnn_pass"); + passes_.emplace_back("depthwise_conv_onednn_pass"); passes_.emplace_back("conv_bn_fuse_pass"); passes_.emplace_back("conv_eltwiseadd_bn_fuse_pass"); - passes_.emplace_back("conv_affine_channel_mkldnn_fuse_pass"); + passes_.emplace_back("conv_affine_channel_onednn_fuse_pass"); passes_.emplace_back("conv_transpose_bn_fuse_pass"); passes_.emplace_back("conv_transpose_eltwiseadd_bn_fuse_pass"); - passes_.emplace_back("conv_bias_mkldnn_fuse_pass"); - passes_.emplace_back("conv_transpose_bias_mkldnn_fuse_pass"); - passes_.emplace_back("conv_elementwise_add_mkldnn_fuse_pass"); - passes_.emplace_back("conv_activation_mkldnn_fuse_pass"); + passes_.emplace_back("conv_bias_onednn_fuse_pass"); + passes_.emplace_back("conv_transpose_bias_onednn_fuse_pass"); + passes_.emplace_back("conv_elementwise_add_onednn_fuse_pass"); + passes_.emplace_back("conv_activation_onednn_fuse_pass"); passes_.emplace_back("fc_fuse_pass"); passes_.emplace_back("repeated_fc_relu_fuse_pass"); - passes_.emplace_back("fc_mkldnn_pass"); - passes_.emplace_back("fc_act_mkldnn_fuse_pass"); - passes_.emplace_back("matmul_transpose_reshape_mkldnn_fuse_pass"); + passes_.emplace_back("fc_onednn_pass"); + 
passes_.emplace_back("fc_act_onednn_fuse_pass"); + passes_.emplace_back("matmul_transpose_reshape_onednn_fuse_pass"); passes_.emplace_back("batch_norm_act_fuse_pass"); passes_.emplace_back("softplus_activation_onednn_fuse_pass"); - passes_.emplace_back("compute_propagate_scales_mkldnn_pass"); + passes_.emplace_back("compute_propagate_scales_onednn_pass"); passes_.emplace_back("scale_matmul_fuse_pass"); - passes_.emplace_back("reshape_transpose_matmul_mkldnn_fuse_pass"); - passes_.emplace_back("matmul_elementwise_add_mkldnn_fuse_pass"); + passes_.emplace_back("reshape_transpose_matmul_onednn_fuse_pass"); + passes_.emplace_back("matmul_elementwise_add_onednn_fuse_pass"); passes_.emplace_back("operator_scale_onednn_fuse_pass"); passes_.emplace_back("operator_unsqueeze2_onednn_fuse_pass"); passes_.emplace_back("operator_reshape2_onednn_fuse_pass"); @@ -510,7 +509,7 @@ void CpuPassStrategy::DisableMkldnnFcPasses() { void CpuPassStrategy::EraseFcMkldnnPasses() { std::vector fc_passes_to_erase( - {"fc_mkldnn_pass", "fc_act_mkldnn_fuse_pass"}); + {"fc_onednn_pass", "fc_act_onednn_fuse_pass"}); for (const auto &pass : fc_passes_to_erase) { int idx = static_cast(GetPassIndex(pass)); if (idx != -1) { @@ -538,6 +537,7 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) { "cast_embedding_trans_ids_to_int32_pass", "delete_elementwise_mul_op_pass", "generate_sequence_xpu_fuse_pass", + "group_norm_silu_xpu_fuse_pass", "embedding_with_eltwise_add_xpu_fuse_pass", "qk_qkv_attention_xpu_fuse_pass", "multi_encoder_xpu_fuse_pass", @@ -545,6 +545,7 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) { "multi_encoder_xpu_slice_fuse_pass", "fused_multi_transformer_cachekv_layout_trans_pass", "fused_multi_transformer_int8_cachekv_layout_trans_pass", + "cross_attention_xpu_fuse_pass", "decoder_attention_xpu_fuse_pass", "one_beam_size_fuse_pass", "fold_interp_outsize_fuse_pass", @@ -586,6 +587,7 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) { "xpu_quantize_op_pass", "xpu_quantize_squash_pass", "link_xpu_op_max_pass", + "spatial_transformer_resblock_xpu_fuse_pass", "delete_isolated_node_pass", "inplace_op_var_pass", }); @@ -611,22 +613,31 @@ const std::vector kPirGpuPasses{ "fc_elementwise_layernorm_fuse_pass", "matmul_scale_fuse_pass", "matmul_transpose_fuse_pass", - "transpose_flatten_concat_fuse_pass"}; + "transpose_flatten_concat_fuse_pass", + "remove_redundant_transpose_pass"}; const std::vector kPirXpuPasses{// Functional pass "map_op_to_another_pass", "identity_op_clean_pass", // Operator fusion pass - "add_layernorm_xpu_fuse_pass"}; + "add_layernorm_xpu_fuse_pass", + "group_norm_silu_xpu_fuse_pass"}; const std::vector kPirMkldnnPasses{ + "depthwise_conv_onednn_pass", + "squeeze_transpose_onednn_fuse_pass", "conv2d_bias_fuse_pass", "conv2d_transpose_bias_fuse_pass", "conv3d_bias_fuse_pass", "batch_norm_act_fuse_pass", + "scale_matmul_fuse_pass", + "reshape_transpose_matmul_fuse_pass", + "matmul_transpose_reshape_fuse_pass", "matmul_elementwise_add_fuse_pass", "matmul_activation_fuse_pass", - "conv_elementwise_add_mkldnn_fuse_pass"}; + "conv_elementwise_add_onednn_fuse_pass", + "conv_activation_onednn_fuse_pass", + "conv_concat_activation_onednn_fuse_pass"}; const std::vector kPirCpuPasses{}; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 79ef68c853cfb..013fb8d477924 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -43,7 +43,7 @@ namespace paddle { 
/// Example Usage: /// Build a new pass. /// \code{cpp} -/// const vector passes(1, "conv_relu_mkldnn_fuse_pass"); +/// const vector passes(1, "conv_relu_onednn_fuse_pass"); /// PaddlePassBuilder builder(passes); /// \endcode class PD_INFER_DECL PaddlePassBuilder { @@ -139,24 +139,24 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { /// \brief Enable the use of cuDNN kernel. virtual void EnableCUDNN() {} - /// \brief Enable the use of MKLDNN. - /// The MKLDNN control exists in both CPU and GPU mode, because there can + /// \brief Enable the use of OneDNN. + /// The OneDNN control exists in both CPU and GPU mode, because there can /// still be some CPU kernels running in GPU mode. virtual void EnableMKLDNN() {} - /// \brief Disable the use of MKLDNN. + /// \brief Disable the use of OneDNN. virtual void DisableMKLDNN() {} - /// \brief Enable MKLDNN quantize optimization. + /// \brief Enable OneDNN quantize optimization. virtual void EnableMkldnnQuantizer() {} - /// \brief Enable MKLDNN bfloat16. + /// \brief Enable OneDNN bfloat16. virtual void EnableMkldnnBfloat16() {} - /// \brief Enable MKLDNN int8. + /// \brief Enable OneDNN int8. virtual void EnableMkldnnInt8() {} - /// \brief Disable MKLDNN fc passes. + /// \brief Disable OneDNN fc passes. virtual void DisableMkldnnFcPasses() {} /// \brief Check if we are using gpu. @@ -214,26 +214,26 @@ class PD_INFER_DECL CpuPassStrategy : public PassStrategy { /// \brief Enable the use of cuDNN kernel. void EnableCUDNN() override; - /// \brief Enable the use of MKLDNN. + /// \brief Enable the use of OneDNN. void EnableMKLDNN() override; - /// \brief Disable the use of MKLDNN. + /// \brief Disable the use of OneDNN. void DisableMKLDNN() override; - /// \brief Enable MKLDNN quantize optimization. + /// \brief Enable OneDNN quantize optimization. void EnableMkldnnQuantizer() override; - /// \brief Enable MKLDNN bfloat16. + /// \brief Enable OneDNN bfloat16. void EnableMkldnnBfloat16() override; - /// \brief Enable MKLDNN int8. + /// \brief Enable OneDNN int8. void EnableMkldnnInt8() override; - /// \brief Disable MKLDNN fc passes. + /// \brief Disable OneDNN fc passes. void DisableMkldnnFcPasses() override; protected: - /// \brief Erase MKLDNN fc passes. + /// \brief Erase OneDNN fc passes. void EraseFcMkldnnPasses(); /// \cond Protected @@ -276,7 +276,7 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy { /// \brief Not supported in GPU mode yet. void EnableMkldnnInt8() override; - /// \brief Disable MKLDNN fc passes. + /// \brief Disable OneDNN fc passes. void DisableMkldnnFcPasses() override; /// \brief Default destructor. diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h index 427e9b95ac499..f1bfe828cbcf2 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.h +++ b/paddle/fluid/inference/capi_exp/pd_config.h @@ -526,14 +526,14 @@ PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigLiteEngineEnabled( PADDLE_CAPI_EXPORT extern void PD_ConfigSwitchIrDebug( __pd_keep PD_Config* pd_config, PD_Bool x); /// -/// \brief Turn on MKLDNN. +/// \brief Turn on OneDNN. /// /// \param[in] pd_config config /// PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMKLDNN( __pd_keep PD_Config* pd_config); /// -/// \brief Set the cache capacity of different input shapes for MKLDNN. +/// \brief Set the cache capacity of different input shapes for OneDNN. /// Default value 0 means not caching any shape. 
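The \code{cpp} example above lost its template argument in flattening; written out (assuming std::vector<std::string>, as elsewhere in this header), it would read:

```cpp
#include <string>
#include <vector>

#include "paddle/fluid/inference/api/paddle_pass_builder.h"

int main() {
  // One-element pass list, as in the doxygen example above.
  const std::vector<std::string> passes(1, "conv_relu_onednn_fuse_pass");
  paddle::PaddlePassBuilder builder(passes);
  (void)builder;
  return 0;
}
```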
/// Please see MKL-DNN Data Caching Design Document: /// https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/mkldnn/caching/caching.md @@ -544,10 +544,10 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMKLDNN( PADDLE_CAPI_EXPORT extern void PD_ConfigSetMkldnnCacheCapacity( __pd_keep PD_Config* pd_config, int32_t capacity); /// -/// \brief A boolean state telling whether to use the MKLDNN. +/// \brief A boolean state telling whether to use the OneDNN. /// /// \param[in] pd_config config -/// \return Whether to use the MKLDNN. +/// \return Whether to use the OneDNN. /// PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnEnabled( __pd_keep PD_Config* pd_config); @@ -570,7 +570,7 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigSetCpuMathLibraryNumThreads( PADDLE_CAPI_EXPORT extern int32_t PD_ConfigGetCpuMathLibraryNumThreads( __pd_keep PD_Config* pd_config); /// -/// \brief Specify the operator type list to use MKLDNN acceleration. +/// \brief Specify the operator type list to use OneDNN acceleration. /// /// \param[in] pd_config config /// \param[in] ops_num The number of operator type list. @@ -579,32 +579,32 @@ PADDLE_CAPI_EXPORT extern int32_t PD_ConfigGetCpuMathLibraryNumThreads( PADDLE_CAPI_EXPORT extern void PD_ConfigSetMkldnnOp( __pd_keep PD_Config* pd_config, size_t ops_num, const char** op_list); /// -/// \brief Turn on MKLDNN quantization. +/// \brief Turn on OneDNN quantization. /// /// \param[in] pd_config config /// PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMkldnnQuantizer( __pd_keep PD_Config* pd_config); /// -/// \brief A boolean state telling whether the MKLDNN quantization is enabled. +/// \brief A boolean state telling whether the OneDNN quantization is enabled. /// /// \param[in] pd_config config -/// \return Whether the MKLDNN quantization is enabled. +/// \return Whether the OneDNN quantization is enabled. /// PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnQuantizerEnabled( __pd_keep PD_Config* pd_config); /// -/// \brief Turn on MKLDNN bfloat16. +/// \brief Turn on OneDNN bfloat16. /// /// \param[in] pd_config config /// PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMkldnnBfloat16( __pd_keep PD_Config* pd_config); /// -/// \brief A boolean state telling whether to use the MKLDNN Bfloat16. +/// \brief A boolean state telling whether to use the OneDNN Bfloat16. /// /// \param[in] pd_config config -/// \return Whether to use the MKLDNN Bfloat16. +/// \return Whether to use the OneDNN Bfloat16. /// PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnBfloat16Enabled( __pd_keep PD_Config* pd_config); @@ -617,17 +617,17 @@ PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnBfloat16Enabled( PADDLE_CAPI_EXPORT extern void PD_ConfigSetBfloat16Op( __pd_keep PD_Config* pd_config, size_t ops_num, const char** op_list); /// -/// \brief Turn on MKLDNN int8. +/// \brief Turn on OneDNN int8. /// /// \param[in] pd_config config /// PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMkldnnInt8( __pd_keep PD_Config* pd_config); /// -/// \brief A boolean state telling whether to use the MKLDNN int8. +/// \brief A boolean state telling whether to use the OneDNN int8. /// /// \param[in] pd_config config -/// \return Whether to use the MKLDNN int8. +/// \return Whether to use the OneDNN int8. 
/// PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnInt8Enabled( __pd_keep PD_Config* pd_config); diff --git a/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Config.cpp b/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Config.cpp index 0d585f938be8c..b4cf4a0953169 100644 --- a/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Config.cpp +++ b/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Config.cpp @@ -122,7 +122,7 @@ Java_com_baidu_paddle_inference_Config_cpuMathLibraryNumThreads( return mathThreadsNum; } -// 5. MKLDNN settings +// 5. OneDNN settings JNIEXPORT void JNICALL Java_com_baidu_paddle_inference_Config_enableMKLDNN( JNIEnv* env, jobject obj, jlong cppPaddleConfigPointer) { diff --git a/paddle/fluid/inference/experimental/javaapi/src/main/java/com/baidu/paddle/inference/Config.java b/paddle/fluid/inference/experimental/javaapi/src/main/java/com/baidu/paddle/inference/Config.java index a312cc73fde22..e9bef0d271f05 100644 --- a/paddle/fluid/inference/experimental/javaapi/src/main/java/com/baidu/paddle/inference/Config.java +++ b/paddle/fluid/inference/experimental/javaapi/src/main/java/com/baidu/paddle/inference/Config.java @@ -208,7 +208,7 @@ public void resetCppPaddleConfigPointer() { private native int cpuMathLibraryNumThreads(long cppPaddleConfigPointer); - // 5. MKLDNN settings + // 5. OneDNN settings private native void enableMKLDNN(long cppPaddleConfigPointer); diff --git a/paddle/fluid/inference/goapi/config.go b/paddle/fluid/inference/goapi/config.go index 9d0a1e5864418..c2e2b410e4061 100644 --- a/paddle/fluid/inference/goapi/config.go +++ b/paddle/fluid/inference/goapi/config.go @@ -554,14 +554,14 @@ func (config *Config) SwitchIrDebug(x bool) { } /// -/// \brief Turn on MKLDNN. +/// \brief Turn on OneDNN. /// func (config *Config) EnableMKLDNN() { C.PD_ConfigEnableMKLDNN(config.c) } /// -/// \brief Set the cache capacity of different input shapes for MKLDNN. +/// \brief Set the cache capacity of different input shapes for OneDNN. /// Default value 0 means not caching any shape. /// Please see MKL-DNN Data Caching Design Document: /// https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/mkldnn/caching/caching.md @@ -573,9 +573,9 @@ func (config *Config) SetMkldnnCacheCapacity(capacity int32) { } /// -/// \brief A boolean state telling whether to use the MKLDNN. +/// \brief A boolean state telling whether to use the OneDNN. /// -/// \return bool Whether to use the MKLDNN. +/// \return bool Whether to use the OneDNN. /// func (config *Config) MkldnnEnabled() bool { return cvtPDBoolToGo(C.PD_ConfigMkldnnEnabled(config.c)) @@ -609,7 +609,7 @@ func (config *Config) CpuMathLibraryNumThreads() int32 { // NativeConfig ToNativeConfig() const; /// -/// \brief Specify the operator type list to use MKLDNN acceleration. +/// \brief Specify the operator type list to use OneDNN acceleration. /// /// \param opList The operator type list. /// @@ -627,23 +627,23 @@ func (config *Config) SetMKLDNNOp(opList []string) { } /// -/// \brief Turn on MKLDNN quantization. +/// \brief Turn on OneDNN quantization. /// func (config *Config) EnableMkldnnQuantizer() { C.PD_ConfigEnableMkldnnQuantizer(config.c) } /// -/// \brief Turn on MKLDNN bfloat16. +/// \brief Turn on OneDNN bfloat16. /// func (config *Config) EnableMkldnnBfloat16() { C.PD_ConfigEnableMkldnnBfloat16(config.c) } /// -/// \brief A boolean state telling whether to use the MKLDNN Bfloat16. 
+/// \brief A boolean state telling whether to use the OneDNN Bfloat16. /// -/// \return bool Whether to use the MKLDNN Bfloat16. +/// \return bool Whether to use the OneDNN Bfloat16. /// func (config *Config) MkldnnBfloat16Enabled() bool { return cvtPDBoolToGo(C.PD_ConfigMkldnnBfloat16Enabled(config.c)) @@ -677,9 +677,9 @@ func (config *Config) ThreadLocalStreamEnabled() bool { } /// -/// \brief A boolean state telling whether the MKLDNN quantization is enabled. +/// \brief A boolean state telling whether the OneDNN quantization is enabled. /// -/// \return bool Whether the MKLDNN quantization is enabled. +/// \return bool Whether the OneDNN quantization is enabled. /// func (config *Config) MkldnnQuantizerEnabled() bool { return cvtPDBoolToGo(C.PD_ConfigMkldnnQuantizerEnabled(config.c)) diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc index f41a25fe9717c..d3d9174e84c48 100644 --- a/paddle/fluid/ir_adaptor/translator/op_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/op_translator.cc @@ -2023,6 +2023,19 @@ struct FillConstant2FullWithTensorTranscriber : public OpTranscriber { const OpInputInfoList& input_infos, pir::Block* block) override { std::vector op_inputs; + if (op_desc.HasInput("ValueTensor", true) && + op_desc.Input("ValueTensor", true).size() > 0) { + auto value_tensor_vars = op_desc.Input("ValueTensor", true); + auto defining_info = (*param_map)[value_tensor_vars[0]]; + op_inputs.push_back(defining_info.value); + } else { + float value = PADDLE_GET_CONST(float, op_desc.GetAttr("value")); + pir::Attribute new_attr = pir::FloatAttribute::get(ctx, value); + auto defining_op = + InsertFullOperationForAttributeInput(ctx, block, new_attr); + op_inputs.push_back(defining_op->result(0)); + } + if (op_desc.HasInput("ShapeTensor", true) && op_desc.Input("ShapeTensor", true).size() > 0) { auto shape_tensor_vars = op_desc.Input("ShapeTensor", true); @@ -2044,18 +2057,6 @@ struct FillConstant2FullWithTensorTranscriber : public OpTranscriber { op_inputs.push_back(defining_op->result(0)); } - if (op_desc.HasInput("ValueTensor", true) && - op_desc.Input("ValueTensor", true).size() > 0) { - auto value_tensor_vars = op_desc.Input("ValueTensor", true); - auto defining_info = (*param_map)[value_tensor_vars[0]]; - op_inputs.push_back(defining_info.value); - } else { - float value = PADDLE_GET_CONST(float, op_desc.GetAttr("value")); - pir::Attribute new_attr = pir::FloatAttribute::get(ctx, value); - auto defining_op = - InsertFullOperationForAttributeInput(ctx, block, new_attr); - op_inputs.push_back(defining_op->result(0)); - } return op_inputs; } diff --git a/paddle/fluid/ir_adaptor/translator/utils.cc b/paddle/fluid/ir_adaptor/translator/utils.cc index 07bbb644c6b72..4015d358930b4 100644 --- a/paddle/fluid/ir_adaptor/translator/utils.cc +++ b/paddle/fluid/ir_adaptor/translator/utils.cc @@ -95,7 +95,7 @@ std::vector CheckUnregisteredOperationInBlock( OpTranscriber general_handler; try { general_handler.LookUpOpInfo(ctx, *op); - } catch (pir::IrNotMetException& e) { + } catch (common::enforce::EnforceNotMet& e) { unregistered_ops.push_back(op->Type()); } } diff --git a/paddle/fluid/jit/engine/interpreter_engine.cc b/paddle/fluid/jit/engine/interpreter_engine.cc index e8f622641c33b..7d575ff838f4f 100644 --- a/paddle/fluid/jit/engine/interpreter_engine.cc +++ b/paddle/fluid/jit/engine/interpreter_engine.cc @@ -52,11 +52,11 @@ void InterpreterEngine::CreateInterpreterCore() { 
framework::ir::PassRegistry::Instance().Get("delete_dropout_op_x_pass"); pass->Apply(&graph); #ifdef PADDLE_WITH_DNNL - auto mkldnn_pass = - framework::ir::PassRegistry::Instance().Get("mkldnn_placement_pass"); - mkldnn_pass->Set("mkldnn_enabled_op_types", + auto onednn_pass = + framework::ir::PassRegistry::Instance().Get("onednn_placement_pass"); + onednn_pass->Set("mkldnn_enabled_op_types", new std::unordered_set({})); - mkldnn_pass->Apply(&graph); + onednn_pass->Apply(&graph); #endif GraphToProgram(graph, &converted_prog_, nullptr); diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index 18bd48d0cd2e1..2fd782875856e 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -2,12 +2,12 @@ add_subdirectory(allocation) file(GLOB fluid_memory_srcs "*.cc") -if(WITH_MKLDNN) - set(MKLDNN_CTX_DEPS mkldnn) +if(WITH_ONEDNN) + set(ONEDNN_CTX_DEPS onednn) else() - set(MKLDNN_CTX_DEPS) + set(ONEDNN_CTX_DEPS) endif() -set(fluid_memory_deps place enforce common allocator ${MKLDNN_CTX_DEPS}) +set(fluid_memory_deps place enforce common allocator ${ONEDNN_CTX_DEPS}) cc_library( fluid_memory diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index 693c1cd47b0de..9d960845198f7 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -147,7 +147,10 @@ static T&& FillValue(T&& allocation) { #if defined(PADDLE_WITH_CUDA) if (allocation != nullptr) { if (FLAGS_sync_after_alloc || FLAGS_alloc_fill_value >= 0) { - PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); + bool need_sync = !platform::is_cpu_place(allocation->place()); + if (need_sync) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); + } if (FLAGS_alloc_fill_value >= 0) { VLOG(10) << "Set " << FLAGS_alloc_fill_value << " on " << allocation->ptr() << " " << allocation->place() << " " @@ -159,7 +162,9 @@ static T&& FillValue(T&& allocation) { std::memset( allocation->ptr(), FLAGS_alloc_fill_value, allocation->size()); } - PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); + if (need_sync) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); + } } } } diff --git a/paddle/fluid/memory/allocation/custom_allocator.cc b/paddle/fluid/memory/allocation/custom_allocator.cc index b4c3ebe1b2926..36848ff9cf0b0 100644 --- a/paddle/fluid/memory/allocation/custom_allocator.cc +++ b/paddle/fluid/memory/allocation/custom_allocator.cc @@ -16,6 +16,10 @@ #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/trace_event.h" + +COMMON_DECLARE_bool(custom_device_mem_record); namespace paddle { namespace memory { @@ -33,6 +37,14 @@ void CustomAllocator::FreeImpl(phi::Allocation* allocation) { phi::DeviceManager::GetDeviceWithPlace(place_)->MemoryDeallocate( allocation->ptr(), allocation->size()); } + if (FLAGS_custom_device_mem_record) { + DEVICE_MEMORY_STAT_UPDATE( + Reserved, place_.GetDeviceId(), -allocation->size()); + platform::RecordMemEvent(allocation->ptr(), + place_, + allocation->size(), + platform::TracerMemEventType::ReservedFree); + } delete allocation; } @@ -42,6 +54,11 @@ phi::Allocation* CustomAllocator::AllocateImpl(size_t size) { void* ptr = phi::DeviceManager::GetDeviceWithPlace(place_)->MemoryAllocate(size); if (LIKELY(ptr)) { + if (FLAGS_custom_device_mem_record) { + DEVICE_MEMORY_STAT_UPDATE(Reserved, place_.GetDeviceId(), size); + 
platform::RecordMemEvent( + ptr, place_, size, platform::TracerMemEventType::ReservedAllocate); + } return new Allocation(ptr, size, place_); } diff --git a/paddle/fluid/memory/allocation/mmap_allocator.cc b/paddle/fluid/memory/allocation/mmap_allocator.cc index a4a05df1dcaa9..f9647032a6a59 100644 --- a/paddle/fluid/memory/allocation/mmap_allocator.cc +++ b/paddle/fluid/memory/allocation/mmap_allocator.cc @@ -54,11 +54,14 @@ struct CountInfo { std::atomic refcount; }; -void AllocateMemoryMap( - std::string filename, int flags, size_t size, void **map_ptr_, int *fd_) { +void AllocateMemoryMap(std::string filename, + int *shared_fd, + int flags, + size_t size, + void **map_ptr_) { // TODO(@ZHUI): support win32 int file_flags = 0; - int fd = -1; + int fd = *shared_fd; if (flags & MAPPED_SHAREDMEM) { file_flags = O_RDWR | O_CREAT; } else { @@ -71,7 +74,7 @@ void AllocateMemoryMap( file_flags &= ~O_CREAT; } - if (!(flags & MAPPED_FROMFD)) { + if (!(flags & MAPPED_FROMFD) && fd == -1) { if (flags & MAPPED_SHAREDMEM) { fd = shm_open(filename.c_str(), file_flags, (mode_t)0600); PADDLE_ENFORCE_NE( @@ -83,8 +86,6 @@ void AllocateMemoryMap( VLOG(6) << "shm_open: " << filename; MemoryMapFdSet::Instance().Insert(filename); } - } else { - fd = -1; } PADDLE_ENFORCE_EQ(ftruncate(fd, size), @@ -98,32 +99,38 @@ void AllocateMemoryMap( *map_ptr_ = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); } + if (flags & MAPPED_UNLINK) { + VLOG(6) << "shm_unlink: " << filename; + shm_unlink(filename.c_str()); + } + PADDLE_ENFORCE_NE(*map_ptr_, MAP_FAILED, platform::errors::Unavailable( "Memory map failed when create shared memory.")); - if (flags & MAPPED_KEEPFD) { - *fd_ = fd; + *shared_fd = fd; + VLOG(6) << "keep fd: " << *shared_fd; } else { PADDLE_ENFORCE_NE(::close(fd), -1, platform::errors::Unavailable( "Error closing memory mapped file <", filename, ">")); - *fd_ = -1; + *shared_fd = -1; } } std::shared_ptr AllocateRefcountedMemoryMapAllocation(std::string filename, + int shared_fd, int flags, size_t size, int buffer_id) { - int fd = -1; + int fd = shared_fd; void *base_ptr = nullptr; if (buffer_id == -1) { - AllocateMemoryMap(filename, flags, size + mmap_alignment, &base_ptr, &fd); + AllocateMemoryMap(filename, &fd, flags, size + mmap_alignment, &base_ptr); VLOG(4) << "Create and mmap a new shm: " << filename; } else { base_ptr = MemoryMapAllocationPool::Instance().GetById(buffer_id).mmap_ptr_; @@ -132,7 +139,7 @@ AllocateRefcountedMemoryMapAllocation(std::string filename, void *aligned_base_ptr = static_cast(static_cast(base_ptr) + mmap_alignment); return std::make_shared( - aligned_base_ptr, size, filename, flags, fd, buffer_id); + aligned_base_ptr, size, filename, fd, flags, buffer_id); } RefcountedMemoryMapAllocation::RefcountedMemoryMapAllocation( @@ -145,11 +152,22 @@ RefcountedMemoryMapAllocation::RefcountedMemoryMapAllocation( : MemoryMapAllocation(ptr, size, ipc_name, fd, flags) { // must reset base ptr first. 
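A hedged sketch of calling the reworked allocation entry point with the new shared_fd parameter; the shm name and flag combination are illustrative only:

```cpp
#include <cstddef>
#include <string>

#include "paddle/fluid/memory/allocation/mmap_allocator.h"

void Demo() {
  namespace alloc = paddle::memory::allocation;
  int fd = -1;  // -1 lets the allocator shm_open the file itself; a valid
                // descriptor received from another process is adopted as-is.
  auto allocation = alloc::AllocateRefcountedMemoryMapAllocation(
      "/paddle_demo_shm",  // illustrative shm name
      fd,
      alloc::MAPPED_SHAREDMEM | alloc::MAPPED_KEEPFD,
      /*size=*/4096,
      /*buffer_id=*/-1);
  // With MAPPED_KEEPFD the descriptor stays open and is closed in close().
}
```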
buffer_id_ = buffer_id; + fd_ = fd; + flags_ = flags; resetBaseptr(); initializeRefercount(); } void MemoryMapAllocation::close() { + if (!closed_fd_) { + closed_fd_ = true; + if (flags_ & MAPPED_KEEPFD) { + PADDLE_ENFORCE_NE(::close(fd_), + -1, + platform::errors::Unavailable( + "Error closing file descriptor <", fd_, ">")); + } + } if (closed_) { return; } @@ -193,6 +211,15 @@ void RefcountedMemoryMapAllocation::close() { void *data = map_ptr_; CountInfo *info = reinterpret_cast(data); --info->refcount; + if (flags_ & MAPPED_KEEPFD) { + closed_fd_ = true; + PADDLE_ENFORCE_NE(::close(fd_), + -1, + platform::errors::Unavailable( + "Error closing file descriptor <", fd_, ">")); + VLOG(6) << "close fd: " << fd_; + } + if (FLAGS_use_shm_cache && buffer_id_ != -1) { return; } else { @@ -260,6 +287,7 @@ std::shared_ptr AllocateMemoryMapWriterAllocation( const std::string &ipc_name = GetIPCName(); int flags = O_RDWR | O_CREAT; int fd = shm_open(ipc_name.c_str(), flags, 0600); + PADDLE_ENFORCE_NE(fd, -1, platform::errors::Unavailable( @@ -283,7 +311,6 @@ std::shared_ptr RebuildMemoryMapReaderAllocation( const std::string &ipc_name, size_t size) { int flags = O_RDWR | O_CREAT; flags &= ~O_CREAT; - int fd = shm_open(ipc_name.c_str(), flags, 0600); PADDLE_ENFORCE_NE(fd, -1, diff --git a/paddle/fluid/memory/allocation/mmap_allocator.h b/paddle/fluid/memory/allocation/mmap_allocator.h index 412e3a3545769..64a3ae9de7658 100644 --- a/paddle/fluid/memory/allocation/mmap_allocator.h +++ b/paddle/fluid/memory/allocation/mmap_allocator.h @@ -44,13 +44,17 @@ enum MappedModes { class MemoryMapAllocation : public Allocation { public: - explicit MemoryMapAllocation(void *ptr, size_t size, std::string ipc_name) + explicit MemoryMapAllocation(void *ptr, + size_t size, + std::string ipc_name, + int fd) : Allocation(ptr, size, platform::CPUPlace()), ipc_name_(std::move(ipc_name)), + fd_(fd), map_ptr_(ptr), map_size_(size) {} explicit MemoryMapAllocation( - void *ptr, size_t size, std::string ipc_name, int flags, int fd) + void *ptr, size_t size, std::string ipc_name, int fd, int flags) : Allocation(ptr, size, platform::CPUPlace()), ipc_name_(std::move(ipc_name)), fd_(fd), @@ -59,6 +63,7 @@ class MemoryMapAllocation : public Allocation { map_size_(size) {} inline const std::string &ipc_name() const { return ipc_name_; } + inline const int shared_fd() const { return fd_; } virtual void close(); @@ -71,6 +76,7 @@ class MemoryMapAllocation : public Allocation { void *map_ptr_ = nullptr; size_t map_size_ = 0; bool closed_ = false; + bool closed_fd_ = false; }; class RefcountedMemoryMapAllocation : public MemoryMapAllocation { @@ -93,11 +99,15 @@ class RefcountedMemoryMapAllocation : public MemoryMapAllocation { void resetBaseptr(); }; -void AllocateMemoryMap( - std::string filename, int flags, size_t size, void **base_ptr_, int *fd_); +void AllocateMemoryMap(std::string filename, + int *shared_fd, + int flags, + size_t size, + void **base_ptr_); std::shared_ptr AllocateRefcountedMemoryMapAllocation(std::string filename, + int shared_fd, int flags, size_t size, int buffer_id = -1); @@ -111,11 +121,13 @@ class MemoryMapWriterAllocation : public Allocation { ipc_name_(std::move(ipc_name)) {} inline const std::string &ipc_name() const { return ipc_name_; } + inline const int shared_fd() const { return fd_; } ~MemoryMapWriterAllocation() override; private: std::string ipc_name_; + int fd_ = -1; }; class MemoryMapReaderAllocation : public Allocation { @@ -127,11 +139,13 @@ class MemoryMapReaderAllocation : public Allocation { 
ipc_name_(std::move(ipc_name)) {} inline const std::string &ipc_name() const { return ipc_name_; } + inline const int shared_fd() const { return fd_; } ~MemoryMapReaderAllocation() override; private: std::string ipc_name_; + int fd_ = -1; }; std::shared_ptr AllocateMemoryMapWriterAllocation( diff --git a/paddle/fluid/memory/allocation/stream_safe_custom_device_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_custom_device_allocator.cc index 218068aeb9c97..bbc0915fe10ce 100644 --- a/paddle/fluid/memory/allocation/stream_safe_custom_device_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_custom_device_allocator.cc @@ -50,35 +50,11 @@ void StreamSafeCustomDeviceAllocation::RecordStream( outstanding_event_map_[stream]->Init(place()); VLOG(9) << "Create a new event " << outstanding_event_map_[stream]->raw_event(); - auto stream_wrapper = phi::stream::Stream(place(), stream); - VLOG(8) << "Record event " << outstanding_event_map_[stream]->raw_event() - << " to stream " << stream; - outstanding_event_map_[stream]->Record(&stream_wrapper); - } -} - -void StreamSafeCustomDeviceAllocation::MarkAsWillBeFreed() { - std::lock_guard lock_guard(outstanding_event_map_lock_); - if (!will_be_freed_) { - will_be_freed_ = false; - VLOG(8) << "ptr: " << ptr() << " will be freed"; - if (phi::DeviceManager::HasDeviceType(place_.GetDeviceType()) && - outstanding_event_map_.find(owning_stream_) == - outstanding_event_map_.end()) { - std::call_once(once_flag_, - [this] { phi::DeviceManager::SetDevice(place_); }); - outstanding_event_map_.insert( - {owning_stream_, std::make_shared()}); - outstanding_event_map_[owning_stream_]->Init(place_); - VLOG(9) << "Create a new event " - << outstanding_event_map_[owning_stream_]->raw_event(); - auto stream_wrapper = phi::stream::Stream(place_, owning_stream_); - VLOG(8) << "Record event " - << outstanding_event_map_[owning_stream_]->raw_event() - << " to stream " << owning_stream_; - outstanding_event_map_[owning_stream_]->Record(&stream_wrapper); - } } + auto stream_wrapper = phi::stream::Stream(place(), stream); + VLOG(8) << "Record event " << outstanding_event_map_[stream]->raw_event() + << " to stream " << stream; + outstanding_event_map_[stream]->Record(&stream_wrapper); } bool StreamSafeCustomDeviceAllocation::CanBeFreed() { @@ -190,7 +166,6 @@ void StreamSafeCustomDeviceAllocator::FreeImpl(phi::Allocation* allocation) { phi::DeviceContextPool::Instance().Get(place_)) ->stream()); } - stream_safe_cuda_allocation->MarkAsWillBeFreed(); if (stream_safe_cuda_allocation->CanBeFreed()) { VLOG(9) << "Directly delete allocation"; delete stream_safe_cuda_allocation; diff --git a/paddle/fluid/memory/allocation/stream_safe_custom_device_allocator.h b/paddle/fluid/memory/allocation/stream_safe_custom_device_allocator.h index 75f25fc0cfc2a..6d6bea9b2535c 100644 --- a/paddle/fluid/memory/allocation/stream_safe_custom_device_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_custom_device_allocator.h @@ -37,7 +37,6 @@ class StreamSafeCustomDeviceAllocation : public Allocation { void RecordStream(phi::stream::stream_t stream); bool CanBeFreed(); - void MarkAsWillBeFreed(); phi::stream::stream_t GetOwningStream() const; void SetOwningStream(phi::stream::stream_t s); diff --git a/paddle/fluid/memory/allocation/system_allocator.cc b/paddle/fluid/memory/allocation/system_allocator.cc index 8fd7967e9752d..a6e19b84ba8d1 100644 --- a/paddle/fluid/memory/allocation/system_allocator.cc +++ b/paddle/fluid/memory/allocation/system_allocator.cc @@ -41,6 +41,7 @@ 
limitations under the License. */ #include "paddle/fluid/platform/profiler/mem_tracing.h" COMMON_DECLARE_bool(use_pinned_memory); +COMMON_DECLARE_bool(custom_device_mem_record); COMMON_DECLARE_double(fraction_of_gpu_memory_to_use); COMMON_DECLARE_uint64(initial_gpu_memory_in_mb); COMMON_DECLARE_uint64(reallocate_gpu_memory_in_mb); @@ -298,6 +299,11 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) { VLOG(4) << "CustomAllocator::Alloc " << p << " size " << size; *index = 0; plug_alloc_size += size; + if (FLAGS_custom_device_mem_record) { + DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, size); + platform::RecordMemEvent( + p, place, size, platform::TracerMemEventType::ReservedAllocate); + } } else { size_t avail, total; @@ -332,6 +338,11 @@ void CustomAllocator::Free(void* p, size_t size, size_t index) { auto place = platform::CustomPlace(dev_type_, dev_id_); auto device = phi::DeviceManager::GetDeviceWithPlace(place); device->MemoryDeallocate(p, size); + if (FLAGS_custom_device_mem_record) { + DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, size); + platform::RecordMemEvent( + p, place, size, platform::TracerMemEventType::ReservedFree); + } } bool CustomAllocator::UseGpu() const { return true; } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 5d03c833a87c7..9126023d389be 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -19,12 +19,10 @@ add_subdirectory(controlflow) add_subdirectory(detection) add_subdirectory(elementwise) add_subdirectory(fused) -add_subdirectory(metrics) add_subdirectory(optimizers) add_subdirectory(reduce_ops) add_subdirectory(sequence_ops) add_subdirectory(string) -add_subdirectory(prim_ops) if(WITH_DISTRIBUTE) @@ -35,8 +33,6 @@ if (WITH_PSCORE) add_subdirectory(pscore) endif() -add_subdirectory(amp) - add_subdirectory(reader) if (NOT WIN32) @@ -51,10 +47,6 @@ if (WITH_DLNNE) add_subdirectory(dlnne) endif() -if (WITH_LITE) - add_subdirectory(lite) -endif() - if(WITH_CINN) add_subdirectory(cinn) endif() diff --git a/paddle/fluid/operators/activation_op.cu.h b/paddle/fluid/operators/activation_op.cu.h index d9b1545abce4c..37fd511d7de17 100644 --- a/paddle/fluid/operators/activation_op.cu.h +++ b/paddle/fluid/operators/activation_op.cu.h @@ -13,9 +13,9 @@ limitations under the License. */ #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/platform/bfloat16.h" #include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/kernels/funcs/activation_functor.h" namespace paddle { diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 38432f8768f59..399ea6963dd0b 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -30,7 +30,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/activation_functor.h" @@ -53,13 +53,13 @@ inline void ExtractActivationTensor(const framework::ExecutionContext& context, phi::DenseTensor** Out) { auto x_var = context.InputVar("X"); auto out_var = context.OutputVar("Out"); - PADDLE_ENFORCE_NOT_NULL(x_var, - platform::errors::NotFound( - "Cannot get input Variable X, variable name = %s", - context.InputName("X"))); + PADDLE_ENFORCE_NOT_NULL( + x_var, + phi::errors::NotFound("Cannot get input Variable X, variable name = %s", + context.InputName("X"))); PADDLE_ENFORCE_NOT_NULL( out_var, - platform::errors::NotFound( + phi::errors::NotFound( "Cannot get output Variable Out, variable name = %s", context.OutputName("Out"))); if (CanBeUsedBySelectedRows.count(context.Type())) { @@ -73,9 +73,9 @@ inline void ExtractActivationTensor(const framework::ExecutionContext& context, PADDLE_ENFORCE_NOT_NULL( *Out, - platform::errors::NotFound("Cannot get the tensor from the Variable " - "Output(Out), variable name = %s", - context.OutputName("Out"))); + phi::errors::NotFound("Cannot get the tensor from the Variable " + "Output(Out), variable name = %s", + context.OutputName("Out"))); } template @@ -94,23 +94,21 @@ inline void ExtractActivationGradTensor( out_var = context.InputVar("Out"); PADDLE_ENFORCE_NOT_NULL( out_var, - platform::errors::NotFound( + phi::errors::NotFound( "Cannot get input Variable Out, variable name = %s", context.InputName("Out"))); } PADDLE_ENFORCE_NOT_NULL( out_grad_var, - platform::errors::NotFound( - "Cannot get input Variable %s, variable name = %s", - framework::GradVarName("Out"), - context.InputName(framework::GradVarName("Out")))); + phi::errors::NotFound("Cannot get input Variable %s, variable name = %s", + framework::GradVarName("Out"), + context.InputName(framework::GradVarName("Out")))); PADDLE_ENFORCE_NOT_NULL( x_grad_var, - platform::errors::NotFound( - "Cannot get output Variable %s, variable name = %s", - framework::GradVarName("X"), - context.OutputName(framework::GradVarName("X")))); + phi::errors::NotFound("Cannot get output Variable %s, variable name = %s", + framework::GradVarName("X"), + context.OutputName(framework::GradVarName("X")))); if (CanBeUsedBySelectedRows.count(context.Type())) { *dOut = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar( @@ -137,19 +135,19 @@ inline void ExtractActivationGradTensor( } } - PADDLE_ENFORCE_NOT_NULL(*dX, - platform::errors::NotFound( - "Cannot get the tensor from the Variable " - "Output(Out), variable name = %s", - context.OutputName(framework::GradVarName("X")))); + PADDLE_ENFORCE_NOT_NULL( + *dX, + phi::errors::NotFound("Cannot get the tensor from the Variable " + "Output(Out), variable name = %s", + context.OutputName(framework::GradVarName("X")))); if (static_cast(kDepValue) & static_cast(ActBwdOpFwdDeps::kDepX)) { auto x_var = context.InputVar("X"); PADDLE_ENFORCE_NOT_NULL( x_var, - platform::errors::NotFound("Cannot get the tensor from the " - "Variable Input(X), variable name = %s", - context.InputName("X"))); + phi::errors::NotFound("Cannot get the tensor from the " + "Variable Input(X), variable name = %s", + context.InputName("X"))); if (CanBeUsedBySelectedRows.count(context.Type())) { *X = 
paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var); } else { @@ -384,16 +382,15 @@ inline void ExtractDoubleGradTensorWithInputDOut( auto ddo_var = ctx.OutputVar("DDOut"); PADDLE_ENFORCE_NOT_NULL( ddx_var, - platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("DDX"))); + phi::errors::NotFound("Cannot get input Variable Out, variable name = %s", + ctx.InputName("DDX"))); *ddX = ctx.Input("DDX"); if (ddo_var) { *ddOut = ctx.Output("DDOut"); } PADDLE_ENFORCE_NOT_NULL( ddX, - platform::errors::NotFound( + phi::errors::NotFound( "Cannot get the tensor from the Variable DDX, variable name = %s", ctx.OutputName("DDX"))); @@ -401,9 +398,8 @@ inline void ExtractDoubleGradTensorWithInputDOut( auto x_var = ctx.InputVar("X"); PADDLE_ENFORCE_NOT_NULL( x_var, - platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("X"))); + phi::errors::NotFound("Cannot get input Variable Out, variable name = %s", + ctx.InputName("X"))); auto dx_var = ctx.OutputVar("DX"); *X = ctx.Input("X"); if (dx_var) { diff --git a/paddle/fluid/operators/add_position_encoding_op.cc b/paddle/fluid/operators/add_position_encoding_op.cc index ad24d37b90d81..13d6f7449f6dd 100644 --- a/paddle/fluid/operators/add_position_encoding_op.cc +++ b/paddle/fluid/operators/add_position_encoding_op.cc @@ -72,7 +72,7 @@ class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker { PADDLE_ENFORCE_GE( alpha, 0.0f, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Attribute 'alpha' must be greater than or equal to 0.0.")); }); AddAttr("beta", "The scale of Position Embedding.") @@ -81,7 +81,7 @@ class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker { PADDLE_ENFORCE_GE( beta, 0.0f, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Attribute 'beta' must be greater than or equal to 0.0.")); }); AddComment(R"DOC( diff --git a/paddle/fluid/operators/add_position_encoding_op.h b/paddle/fluid/operators/add_position_encoding_op.h index 4547f6321a01d..009e40efeae38 100644 --- a/paddle/fluid/operators/add_position_encoding_op.h +++ b/paddle/fluid/operators/add_position_encoding_op.h @@ -41,7 +41,7 @@ class AddPositionEncodingKernel : public framework::OpKernel { if (x_lod.empty()) { PADDLE_ENFORCE_EQ(x_dim.size(), 3, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input(X)'s dimension of AddPositionEncodingOp " "should be equal to " "3, but received %d. ", @@ -52,14 +52,14 @@ class AddPositionEncodingKernel : public framework::OpKernel { } else { PADDLE_ENFORCE_EQ(x_dim.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input(X)'s dimension of AddPositionEncodingOp " "should be equal to " "2, but received %d. ", x_dim.size())); PADDLE_ENFORCE_EQ(x_lod.size(), 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input(X)'s lod level of AddPositionEncodingOp " "should be equal to " "1, but received %d. ", @@ -70,13 +70,13 @@ class AddPositionEncodingKernel : public framework::OpKernel { enc_size = x_dim[1]; } - PADDLE_ENFORCE_EQ(enc_size % 2, - 0, - platform::errors::InvalidArgument( - "The input(X)'s feature size of " - "AddPositionEncodingOp only support even, " - "but received an odd number: %d. ", - enc_size)); + PADDLE_ENFORCE_EQ( + enc_size % 2, + 0, + phi::errors::InvalidArgument("The input(X)'s feature size of " + "AddPositionEncodingOp only support even, " + "but received an odd number: %d. 
", + enc_size)); const int half_size = enc_size / 2; for (int i = 0; i < batch_size; ++i) { diff --git a/paddle/fluid/operators/affine_channel_op.cc b/paddle/fluid/operators/affine_channel_op.cc index f44c181cca097..b80672216efe3 100644 --- a/paddle/fluid/operators/affine_channel_op.cc +++ b/paddle/fluid/operators/affine_channel_op.cc @@ -80,13 +80,13 @@ class AffineChannelOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( scale_dims.size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimensions of Input(Scale) must be 1," "But received the dimensions of Input(Scale) is [%d] ", scale_dims.size())); PADDLE_ENFORCE_EQ(b_dims.size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimensions of Input(Bias) must be 1," "But received the dimensions of Input(Bias) is [%d] ", scale_dims.size())); @@ -94,7 +94,7 @@ class AffineChannelOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( scale_dims[0], C, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dimension value of Input(Scale) must be [%d]," "But received [%d].", C, @@ -104,7 +104,7 @@ class AffineChannelOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( b_dims[0], C, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dimension value of Input(Bias) must be [%d]," "But received [%d].", C, diff --git a/paddle/fluid/operators/affine_channel_op_xpu.cc b/paddle/fluid/operators/affine_channel_op_xpu.cc index 799bb87cf9892..9024dab8f98c2 100644 --- a/paddle/fluid/operators/affine_channel_op_xpu.cc +++ b/paddle/fluid/operators/affine_channel_op_xpu.cc @@ -70,7 +70,7 @@ class AffineChannelXPUKernel : public framework::OpKernel { dev_ctx.x_context(), x_d, scale_d, y_d, x_shape, b_shape); PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::External( + phi::errors::External( "The broadcast_mul XPU OP return wrong value[%d %s]", r, XPUAPIErrorMsg[r])); @@ -78,7 +78,7 @@ class AffineChannelXPUKernel : public framework::OpKernel { dev_ctx.x_context(), y_d, bias_d, y_d, x_shape, b_shape); PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::External( + phi::errors::External( "The broadcast_add XPU OP return wrong value[%d %s]", r, XPUAPIErrorMsg[r])); @@ -140,28 +140,28 @@ class AffineChannelGradXPUKernel : public framework::OpKernel { dev_ctx.x_context(), dy_d, dbias_d, x_shape, rdims); PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::External( + phi::errors::External( "The reduce_sum XPU OP return wrong value[%d %s]", r, XPUAPIErrorMsg[r])); xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); T* tmp = RAII_GUARD.alloc_l3_or_gm(dy->numel()); PADDLE_ENFORCE_NOT_NULL( - tmp, platform::errors::External("XPU has no enough memory")); + tmp, phi::errors::External("XPU has no enough memory")); r = xpu::mul( dev_ctx.x_context(), dy_d, x->data(), tmp, dy->numel()); PADDLE_ENFORCE_EQ( r, xpu::Error_t::SUCCESS, - platform::errors::External("The mul XPU OP return wrong value[%d %s]", - r, - XPUAPIErrorMsg[r])); + phi::errors::External("The mul XPU OP return wrong value[%d %s]", + r, + XPUAPIErrorMsg[r])); r = xpu::reduce_sum( dev_ctx.x_context(), tmp, dscale_d, x_shape, rdims); PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::External( + phi::errors::External( "The reduce_sum XPU OP return wrong value[%d %s]", r, XPUAPIErrorMsg[r])); @@ -172,7 +172,7 @@ class AffineChannelGradXPUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( r, xpu::Error_t::SUCCESS, - 
platform::errors::External( + phi::errors::External( "The broadcast_mul XPU OP return wrong value[%d %s]", r, XPUAPIErrorMsg[r])); diff --git a/paddle/fluid/operators/amp/CMakeLists.txt b/paddle/fluid/operators/amp/CMakeLists.txt deleted file mode 100644 index cbd9c8b2768b4..0000000000000 --- a/paddle/fluid/operators/amp/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -include(operators) -if(WITH_UNITY_BUILD) - # Load Unity Build rules for operators in paddle/fluid/operators/amp. - include(unity_build_rule.cmake) -endif() -register_operators() diff --git a/paddle/fluid/operators/amp/alloc_float_status_op.cc b/paddle/fluid/operators/amp/alloc_float_status_op.cc deleted file mode 100644 index 2c1b4b201e5c3..0000000000000 --- a/paddle/fluid/operators/amp/alloc_float_status_op.cc +++ /dev/null @@ -1,77 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -class AllocFloatStatusOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasOutput("FloatStatus"), - "Output", - "FloatStatus", - "alloc_float_status"); - ctx->SetOutputDim("FloatStatus", {8}); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(framework::proto::VarType::FP32, ctx.GetPlace()); - } -}; - -class AllocFloatStatusMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddOutput("FloatStatus", - "(Tensor) of shape {8} that holds the float status."); - AddComment(R"DOC( - Produces a float Tensor that holds the float status -)DOC"); - } -}; - -template -class AllocFloatStatusKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Operator alloc_float_status is not supported on CPU")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -using CPU = phi::CPUContext; - -REGISTER_OPERATOR( - alloc_float_status, - ops::AllocFloatStatusOp, - ops::AllocFloatStatusMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -PD_REGISTER_STRUCT_KERNEL( - alloc_float_status, CPU, ALL_LAYOUT, ops::AllocFloatStatusKernel, float) {} diff --git a/paddle/fluid/operators/amp/clear_float_status_op.cc b/paddle/fluid/operators/amp/clear_float_status_op.cc deleted file mode 100644 index d595a26e5575a..0000000000000 --- a/paddle/fluid/operators/amp/clear_float_status_op.cc +++ /dev/null @@ -1,79 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
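Nearly every hunk in these operator files is the same mechanical rewrite: the error factory moves from the fluid-layer platform::errors namespace to phi::errors, while the PADDLE_ENFORCE_* macro, the error class, the message, and the printf-style arguments are untouched. A minimal before/after sketch of the pattern, taken from the affine_channel XPU out-of-memory check above:

// Before: error constructed through the fluid platform layer.
PADDLE_ENFORCE_NOT_NULL(
    tmp, platform::errors::External("XPU has no enough memory"));

// After: identical check, identical message; only the factory namespace
// changes, so runtime behavior is the same.
PADDLE_ENFORCE_NOT_NULL(
    tmp, phi::errors::External("XPU has no enough memory"));

The whitespace-only reflowing around many of these call sites is clang-format reacting to the shorter qualifier, not a semantic change.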
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -class ClearFloatStatusOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasOutput("FloatStatusOut"), - "Output", - "FloatStatusOut", - "clear_float_status"); - ctx->SetOutputDim("FloatStatusOut", ctx->GetInputDim("FloatStatus")); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(framework::proto::VarType::FP32, ctx.GetPlace()); - } -}; - -class ClearFloatStatusMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("FloatStatus", - "(Tensor) of shape {8} that holds the float status."); - AddOutput( - "FloatStatusOut", - "(Tensor) of shape {8} that holds the float status, which is cleared."); - AddComment(R"DOC( - Clear the float status -)DOC"); - } -}; - -template -class ClearFloatStatusKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Operator clear_float_status is not supported on CPU")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR( - clear_float_status, - ops::ClearFloatStatusOp, - ops::ClearFloatStatusMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -PD_REGISTER_STRUCT_KERNEL( - clear_float_status, CPU, ALL_LAYOUT, ops::ClearFloatStatusKernel, float) {} diff --git a/paddle/fluid/operators/amp/get_float_status_op.cc b/paddle/fluid/operators/amp/get_float_status_op.cc deleted file mode 100644 index 8700d82976f01..0000000000000 --- a/paddle/fluid/operators/amp/get_float_status_op.cc +++ /dev/null @@ -1,79 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
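The amp operators being deleted here (alloc_float_status and clear_float_status above, and get_float_status whose body follows) all share one skeleton: an InferShape that fixes the float-status tensor to shape {8}, a kernel key pinned to FP32, and a CPU kernel whose Compute only throws. A condensed sketch of that stub shape, with a hypothetical class name and the phi error factory this series standardizes on:

// Sketch only: the CPU registration exists so the op resolves, but CPU
// execution is rejected at run time; the real kernels live on device.
template <typename T, typename DeviceContext>
class FloatStatusStubKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_THROW(phi::errors::Unimplemented(
        "Operator %s is not supported on CPU", ctx.Type()));
  }
};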
*/ - -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -class GetFloatStatusOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasOutput("FloatStatusOut"), - "Output", - "FloatStatusOut", - "get_float_status"); - ctx->SetOutputDim("FloatStatusOut", ctx->GetInputDim("FloatStatus")); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(framework::proto::VarType::FP32, ctx.GetPlace()); - } -}; - -class GetFloatStatusMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("FloatStatus", - "(Tensor) of shape {8} that holds the float status."); - AddOutput("FloatStatusOut", - "(Tensor) of shape {8} that holds the get float status."); - AddComment(R"DOC( - Get the float status -)DOC"); - } -}; - -template -class GetFloatStatusKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Operator get_float_status is not supported on CPU")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -using CPU = phi::CPUContext; - -REGISTER_OPERATOR( - get_float_status, - ops::GetFloatStatusOp, - ops::GetFloatStatusMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -PD_REGISTER_STRUCT_KERNEL( - get_float_status, CPU, ALL_LAYOUT, ops::GetFloatStatusKernel, float) {} diff --git a/paddle/fluid/operators/amp/unity_build_rule.cmake b/paddle/fluid/operators/amp/unity_build_rule.cmake deleted file mode 100644 index fa460e33c8068..0000000000000 --- a/paddle/fluid/operators/amp/unity_build_rule.cmake +++ /dev/null @@ -1,10 +0,0 @@ -# This file records the Unity Build compilation rules. -# The source files in a `register_unity_group` called are compiled in a unity -# file. -# Generally, the combination rules in this file do not need to be modified. -# If there are some redefined error in compiling with the source file which -# in combination rule, you can remove the source file from the following rules. -register_unity_group(cc check_finite_and_unscale_op.cc - update_loss_scaling_op.cc) -register_unity_group(cu check_finite_and_unscale_op.cu - update_loss_scaling_op.cu) diff --git a/paddle/fluid/operators/array_operator.h b/paddle/fluid/operators/array_operator.h index c7b8ce3f381d1..3d8a08b6725f1 100644 --- a/paddle/fluid/operators/array_operator.h +++ b/paddle/fluid/operators/array_operator.h @@ -34,16 +34,15 @@ class ArrayOp : public framework::OperatorBase { size_t GetOffset(const framework::Scope &scope, const platform::Place &place) const { auto *i = scope.FindVar(Input("I")); - PADDLE_ENFORCE_NOT_NULL( - i, platform::errors::NotFound("Input(I) is not found.")); + PADDLE_ENFORCE_NOT_NULL(i, phi::errors::NotFound("Input(I) is not found.")); auto &i_tensor = i->Get(); - PADDLE_ENFORCE_EQ(i_tensor.numel(), - 1, - platform::errors::InvalidArgument( - "Input(I) must have numel 1. " - "But received %d, and it's shape is [%s].", - i_tensor.numel(), - i_tensor.dims())); + PADDLE_ENFORCE_EQ( + i_tensor.numel(), + 1, + phi::errors::InvalidArgument("Input(I) must have numel 1. 
" + "But received %d, and it's shape is [%s].", + i_tensor.numel(), + i_tensor.dims())); // get device context from pool platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index 275dc6a99d63e..fae4ecbf9eb2b 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -59,7 +59,7 @@ struct ArrayToLoDFunctor { Apply(static_cast(pool.Get(place))); #else PADDLE_THROW( - platform::errors::Unavailable("Paddle is not compiled with CUDA.")); + phi::errors::Unavailable("Paddle is not compiled with CUDA.")); #endif } } @@ -101,7 +101,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { // dim PADDLE_ENFORCE_EQ(x.empty(), false, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "There's no element in the input array.")); int rank = x[0].dims().size(); platform::Place place = x[0].place(); @@ -116,7 +116,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ( ins_i_dims, ins_dims, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of the %zu'th element in LoDTensorArray " "differs from previous ones." "The current dimension is %d, and the previous dimension is %d.", @@ -126,7 +126,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ( x[i].place(), place, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The place class of the %zu'th element in LoDTensorArray " "differs from previous ones." "The current place is %d, and the previous place is %d.", @@ -136,7 +136,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ( x[i].dtype(), data_type, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The date type of the %zu'th element in LoDTensorArray " "differs from previous ones." "The current data type is %d, and the previous data type is %d.", @@ -172,7 +172,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { cur_level_lod.push_back(cur_level_lod.back() + table_items[idx].length); PADDLE_ENFORCE_LE(table_items[idx].length, x.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The RankTable items length should less than or " "equal to Input(X) size," "but receive TankTable items length is %d , longer " @@ -194,7 +194,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { PADDLE_ENFORCE_GE( end_offset, start_offset, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The lod data start offset should smaller or equal to the end " "offset," "but the start offset is %d, larger than end offset %d.", @@ -243,11 +243,11 @@ class ArrayToLoDTensorInferShape : public framework::InferShapeBase { PADDLE_ENFORCE_EQ( context->HasInput("X"), true, - platform::errors::NotFound("Input(X) of BmmOp should not be null.")); - PADDLE_ENFORCE_EQ(context->HasInput("RankTable"), - true, - platform::errors::NotFound( - "Input(RankTable) of BmmOp should not be null.")); + phi::errors::NotFound("Input(X) of BmmOp should not be null.")); + PADDLE_ENFORCE_EQ( + context->HasInput("RankTable"), + true, + phi::errors::NotFound("Input(RankTable) of BmmOp should not be null.")); // For compile-time, the first dim of input X and output Out should be -1. // For runtime, the first dim of output Out should be the sum of all // elements's first dim in input X. 
The output's dims will be re-computed in diff --git a/paddle/fluid/operators/assert_op.cc b/paddle/fluid/operators/assert_op.cc index 4ab60914908da..5a9fb09d44807 100644 --- a/paddle/fluid/operators/assert_op.cc +++ b/paddle/fluid/operators/assert_op.cc @@ -56,14 +56,14 @@ class AssertOp : public framework::OperatorBase { const platform::Place &dev_place) const override { const framework::Variable *cond_var_ptr = scope.FindVar(Input(kCond.data())); - PADDLE_ENFORCE_NOT_NULL(cond_var_ptr, - platform::errors::NotFound( - "Input(Condition) of AssertOp is not found.")); + PADDLE_ENFORCE_NOT_NULL( + cond_var_ptr, + phi::errors::NotFound("Input(Condition) of AssertOp is not found.")); const phi::DenseTensor &cond = cond_var_ptr->Get(); PADDLE_ENFORCE_EQ( cond.numel(), 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The numel of Input(Condition) of AssertOp must be 1. But now " "the Condition's shape is %s.", cond.dims().to_str())); @@ -83,7 +83,7 @@ class AssertOp : public framework::OperatorBase { formatter.Print(x_tensor, name); } - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "The condition variable '%s' of AssertOp must be " "true, but received false", Input(kCond.data()))); diff --git a/paddle/fluid/operators/assign_op.h b/paddle/fluid/operators/assign_op.h index 6efc621120929..36cee420f0c36 100644 --- a/paddle/fluid/operators/assign_op.h +++ b/paddle/fluid/operators/assign_op.h @@ -64,7 +64,7 @@ class AssignFunctor { PADDLE_ENFORCE_EQ( true, false, - platform::errors::PermissionDenied( + phi::errors::PermissionDenied( "Not support type for assign op with type %s", typeid(T).name())); } diff --git a/paddle/fluid/operators/assign_pos_op.cc b/paddle/fluid/operators/assign_pos_op.cc index 66c453885e4a9..7def3a0cac503 100644 --- a/paddle/fluid/operators/assign_pos_op.cc +++ b/paddle/fluid/operators/assign_pos_op.cc @@ -41,11 +41,11 @@ class AssignPosOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(cum_count_dtype, X_dtype, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dtype of the cum_count and X should be same")); PADDLE_ENFORCE_EQ(cum_count_dtype, framework::proto::VarType::INT64, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dtype of the cum_count_dtype, eff_num_len and " "X should be same as int64")); return phi::KernelKey(cum_count_dtype, ctx.device_context().GetPlace()); diff --git a/paddle/fluid/operators/assign_value_op.h b/paddle/fluid/operators/assign_value_op.h index 5ba8b9367e64e..d147575773c06 100644 --- a/paddle/fluid/operators/assign_value_op.h +++ b/paddle/fluid/operators/assign_value_op.h @@ -119,7 +119,7 @@ class AssignValueKernel : public framework::OpKernel { value_name = "int8_values"; break; default: - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(phi::errors::Unimplemented( "Unsupported data type(code %d) for AssignValue operator, only " "supports bool, int32, float32, float64, int8 and int64.", dtype)); diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 6a0775e6331a7..3a5b50a7906e5 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -54,7 +54,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { const int M = static_cast(x_dims[1]); PADDLE_ENFORCE_EQ(x_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Expected input(X)'s dimension is 2. 
But received %d.", x_dims.size())); @@ -63,39 +63,39 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ( w_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Expected input(LSTMWeight)'s dimension is 2.But received %d.", w_dims.size())); PADDLE_ENFORCE_EQ( w_dims[0], D + M, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "LSTMWeight dims should be (%d + %d) * %d.", D, M, 4 * D)); auto b_dims = ctx->GetInputDim("LSTMBias"); PADDLE_ENFORCE_EQ( b_dims.size(), 2, - platform::errors::InvalidArgument("Input(LSTMBias)'s rank must be 2.")); - PADDLE_ENFORCE_EQ(b_dims[0], - 1, - platform::errors::InvalidArgument( - "LSTMBias dims should be 1 x %d.", 4 * D)); - PADDLE_ENFORCE_EQ(b_dims[1], - 4 * D, - platform::errors::InvalidArgument( - "LSTMBias dims should be 1 x %d.", 4 * D)); + phi::errors::InvalidArgument("Input(LSTMBias)'s rank must be 2.")); + PADDLE_ENFORCE_EQ( + b_dims[0], + 1, + phi::errors::InvalidArgument("LSTMBias dims should be 1 x %d.", 4 * D)); + PADDLE_ENFORCE_EQ( + b_dims[1], + 4 * D, + phi::errors::InvalidArgument("LSTMBias dims should be 1 x %d.", 4 * D)); auto c_dims = ctx->GetInputDim("C0"); PADDLE_ENFORCE_EQ( c_dims.size(), 2, - platform::errors::InvalidArgument("Input(C0)'s rank must be 2.")); + phi::errors::InvalidArgument("Input(C0)'s rank must be 2.")); if (ctx->IsRuntime()) { PADDLE_ENFORCE_EQ( c_dims[1], D, - platform::errors::InvalidArgument("C0 dims should be N x %d.", D)); + phi::errors::InvalidArgument("C0 dims should be N x %d.", D)); } if (ctx->HasInput("H0")) { @@ -103,27 +103,27 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ( h_dims.size(), 2UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Expected input(H0)'s dimension is 2. But received %d.", h_dims.size())); if (ctx->IsRuntime() || (common::product(c_dims) > 0 && common::product(h_dims) > 0)) { PADDLE_ENFORCE_EQ(h_dims, c_dims, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of Input(H0) and Input(C0) " "should be the same.")); } } auto atten_w_dims = ctx->GetInputDim("AttentionWeight"); - PADDLE_ENFORCE_EQ(atten_w_dims.size(), - 2, - platform::errors::InvalidArgument( - "Input(AttentionWeight)'s rank must be 2.")); + PADDLE_ENFORCE_EQ( + atten_w_dims.size(), + 2, + phi::errors::InvalidArgument("Input(AttentionWeight)'s rank must be 2.")); PADDLE_ENFORCE_EQ(atten_w_dims[0], M + D, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Expected `AttentionWeight` shape is [(%d + %d), 1]. 
" "But received shape = [%d, 1], shape[0] is not %d.", M, @@ -132,39 +132,39 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { M + D)); PADDLE_ENFORCE_EQ(atten_w_dims[1], 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "AttentionWeight shapes must be (%d + %d) * 1.", M, D)); if (ctx->HasInput("AttentionBias")) { auto atten_b_dims = ctx->GetInputDim("AttentionBias"); - PADDLE_ENFORCE_EQ(atten_b_dims.size(), - 2, - platform::errors::InvalidArgument( - "Input(AttentionBias)'s rank must be 2.")); - PADDLE_ENFORCE_EQ(atten_b_dims[0], - 1, - platform::errors::InvalidArgument( - "AttentionBias shapes must be 1 * 1.")); - PADDLE_ENFORCE_EQ(atten_b_dims[1], - 1, - platform::errors::InvalidArgument( - "AttentionBias shapes must be 1 * 1.")); + PADDLE_ENFORCE_EQ( + atten_b_dims.size(), + 2, + phi::errors::InvalidArgument("Input(AttentionBias)'s rank must be 2.")); + PADDLE_ENFORCE_EQ( + atten_b_dims[0], + 1, + phi::errors::InvalidArgument("AttentionBias shapes must be 1 * 1.")); + PADDLE_ENFORCE_EQ( + atten_b_dims[1], + 1, + phi::errors::InvalidArgument("AttentionBias shapes must be 1 * 1.")); } if (ctx->HasInput("AttentionScalar")) { auto dims = ctx->GetInputDim("AttentionScalar"); PADDLE_ENFORCE_EQ(dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(AttentionScalar)'s rank must be 2.")); - PADDLE_ENFORCE_EQ(dims[0], - 1, - platform::errors::InvalidArgument( - "AttentionScalar shapes must be 1 * 1.")); - PADDLE_ENFORCE_EQ(dims[1], - 1, - platform::errors::InvalidArgument( - "AttentionScalar shapes must be 1 * 1.")); + PADDLE_ENFORCE_EQ( + dims[0], + 1, + phi::errors::InvalidArgument("AttentionScalar shapes must be 1 * 1.")); + PADDLE_ENFORCE_EQ( + dims[1], + 1, + phi::errors::InvalidArgument("AttentionScalar shapes must be 1 * 1.")); } if (ctx->HasInput("AttentionScalarBias")) { @@ -175,15 +175,15 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { "AttentionLstm"); PADDLE_ENFORCE_EQ(dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(AttentionScalarBias)'s rank must be 2.")); PADDLE_ENFORCE_EQ(dims[0], 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "AttentionScalarBias shapes must be 1 * 1.")); PADDLE_ENFORCE_EQ(dims[1], 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "AttentionScalarBias shapes must be 1 * 1.")); } @@ -381,11 +381,11 @@ class AttentionLSTMKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( x_lod.size(), 1UL, - platform::errors::InvalidArgument("Input(X)'s lod size must be 1.")); + phi::errors::InvalidArgument("Input(X)'s lod size must be 1.")); PADDLE_ENFORCE_EQ( c0->dims()[0], N, - platform::errors::InvalidArgument("C0 dims should be %d x %d.", N, D)); + phi::errors::InvalidArgument("C0 dims should be %d x %d.", N, D)); fc_out->Resize({max_seq_len, 1}); std::function act_gate, act_cell, act_cand; diff --git a/paddle/fluid/operators/batch_fc_op.cc b/paddle/fluid/operators/batch_fc_op.cc index 706cb17e40f34..2eea44e05b057 100644 --- a/paddle/fluid/operators/batch_fc_op.cc +++ b/paddle/fluid/operators/batch_fc_op.cc @@ -27,49 +27,49 @@ class BatchFCOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->HasInput("Input"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "X(Input) of Batch Fully Connected should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( + 
phi::errors::InvalidArgument( "Out(Output) of Batch Fully Connected should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasInput("W"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "W(Input) of Batch Fully Connected should not be null.")); auto input_dims = ctx->GetInputDim("Input"); auto w_dims = ctx->GetInputDim("W"); - PADDLE_ENFORCE_EQ(input_dims.size(), - 3, - platform::errors::InvalidArgument( - "Input of BatchFCOp should have 3D.")); + PADDLE_ENFORCE_EQ( + input_dims.size(), + 3, + phi::errors::InvalidArgument("Input of BatchFCOp should have 3D.")); PADDLE_ENFORCE_EQ( w_dims.size(), 3, - platform::errors::InvalidArgument("W of BatchFCOp should have 3D.")); + phi::errors::InvalidArgument("W of BatchFCOp should have 3D.")); PADDLE_ENFORCE_EQ( input_dims[0], w_dims[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input.dim[0] and W.dim[0] of BatchFCOp should be same.")); PADDLE_ENFORCE_EQ( input_dims[2], w_dims[1], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input.dim[2] and W.dim[1] of BatchFCOp should be same.")); auto bias_dims = ctx->GetInputDim("Bias"); PADDLE_ENFORCE_EQ(bias_dims[0], input_dims[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Bias.dim[0] should be same as input.dim[0].")); PADDLE_ENFORCE_EQ(bias_dims[1], w_dims[2], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Bias.dim[1] should be same as input.dim[2].")); ctx->SetOutputDim("Out", {input_dims[0], input_dims[1], w_dims[2]}); @@ -89,14 +89,13 @@ class BatchFCGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Input"), - true, - platform::errors::InvalidArgument("Input should not be null")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), + true, + phi::errors::InvalidArgument("Input should not be null")); PADDLE_ENFORCE_EQ( ctx->HasInput("W"), true, - platform::errors::InvalidArgument("Input(W) should not be null")); + phi::errors::InvalidArgument("Input(W) should not be null")); ctx->SetOutputDim(framework::GradVarName("Input"), ctx->GetInputDim("Input")); diff --git a/paddle/fluid/operators/batch_fc_op.h b/paddle/fluid/operators/batch_fc_op.h index ca8c22243dbe4..5db142d5da6ba 100644 --- a/paddle/fluid/operators/batch_fc_op.h +++ b/paddle/fluid/operators/batch_fc_op.h @@ -26,7 +26,7 @@ class BatchFCKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::Unimplemented("BatchFC only supports GPU now.")); + phi::errors::Unimplemented("BatchFC only supports GPU now.")); } }; } // namespace operators diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 996c6af070631..31a21f2138e6f 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -20,7 +20,7 @@ limitations under the License. 
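Read together, the BatchFCOp::InferShape checks above pin down a batched fully-connected contract: Input [B, M, K], W [B, K, N], Bias [B, N], producing Out [B, M, N] from B independent FC problems. A standalone sketch of that inference (the dimension names are my gloss, not the op's):

#include <array>
#include <cassert>
#include <cstdint>

// Mirrors the enforced constraints: batch dims match, the inner dims chain
// for a matmul, and Bias is shaped [batch, out_dim].
std::array<std::int64_t, 3> InferBatchFCOut(
    const std::array<std::int64_t, 3>& input,   // [B, M, K]
    const std::array<std::int64_t, 3>& w,       // [B, K, N]
    const std::array<std::int64_t, 2>& bias) {  // [B, N]
  assert(input[0] == w[0]);                        // Input.dim[0] == W.dim[0]
  assert(input[2] == w[1]);                        // Input.dim[2] == W.dim[1]
  assert(bias[0] == input[0] && bias[1] == w[2]);  // Bias rows/cols line up
  return {input[0], input[1], w[2]};               // Out: [B, M, N]
}

Note that the second bias message in the hunk reads "Bias.dim[1] should be same as input.dim[2]" while the check compares against W.dim[2]; the check, not the message, is what the op enforces.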
*/ #include "paddle/fluid/framework/data_layout.h" #ifdef PADDLE_WITH_DNNL -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/onednn_helper.h" #endif #include "paddle/fluid/prim/api/composite_backward/composite_backward_api.h" @@ -57,12 +57,12 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const { // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Mean and MeanOut should share the same memory")); PADDLE_ENFORCE_EQ( ctx->Inputs("Variance")[0], ctx->Outputs("VarianceOut")[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Variance and VarianceOut should share the same memory")); const auto x_dims = ctx->GetInputDim("X"); @@ -71,7 +71,7 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const { PADDLE_ENFORCE_EQ( (x_dims[i] == -1) || (x_dims[i] > 0), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Each dimension of input tensor is expected to be -1 or a " "positive number, but received %d. Input's shape is [%s].", x_dims[i], @@ -85,7 +85,7 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const { auto mom = ctx->Inputs("MomentumTensor"); PADDLE_ENFORCE_EQ(mom.size(), 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input tensor MomentumTensor's size must be 1" "But received: MomentumTensor's size is [%d]", mom.size())); @@ -94,7 +94,7 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const { PADDLE_ENFORCE_GE( x_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: the dimension of input " "X must greater than or equal to 2. But received: the shape of input " "X = [%s], the dimension of input X =[%d]", @@ -103,7 +103,7 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const { PADDLE_ENFORCE_LE( x_dims.size(), 5, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: the dimension of input X " "must smaller than or equal to 5. But received: the shape of input X " "= [%s], the dimension of input X = [%d]", @@ -121,7 +121,7 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const { PADDLE_ENFORCE_EQ( scale_dim.size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: the dimension of scale must equal to 1." "But received: the shape of scale is [%s], the dimension " "of scale is [%d]", @@ -134,7 +134,7 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const { PADDLE_ENFORCE_EQ( bias_dim.size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: the dimension of bias must equal to 1." 
"But received: the shape of bias is [%s],the dimension " "of bias is [%d]", @@ -153,14 +153,14 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const { if (check) { PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: the shape of scale must equal to [%d]" "But received: the shape of scale is [%d]", C, ctx->GetInputDim("Scale")[0])); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], C, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: the shape of bias must equal to [%d]" "But received: the shape of bias is [%d]", C, @@ -191,29 +191,29 @@ phi::KernelKey BatchNormOp::GetExpectedKernelType( bn_param_type = framework::proto::VarType::FP64; } if (ctx.HasInput("Scale")) { - PADDLE_ENFORCE_EQ(bn_param_type, - framework::TransToProtoVarType( - ctx.Input("Scale")->dtype()), - platform::errors::InvalidArgument( - "Scale input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType( + ctx.Input("Scale")->dtype()), + phi::errors::InvalidArgument("Scale input should be of float type")); } if (ctx.HasInput("Bias")) { - PADDLE_ENFORCE_EQ(bn_param_type, - framework::TransToProtoVarType( - ctx.Input("Bias")->dtype()), - platform::errors::InvalidArgument( - "Bias input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType( + ctx.Input("Bias")->dtype()), + phi::errors::InvalidArgument("Bias input should be of float type")); } PADDLE_ENFORCE_EQ( bn_param_type, framework::TransToProtoVarType( ctx.Input("Mean")->dtype()), - platform::errors::InvalidArgument("Mean input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, - framework::TransToProtoVarType( - ctx.Input("Variance")->dtype()), - platform::errors::InvalidArgument( - "Variance input should be of float type")); + phi::errors::InvalidArgument("Mean input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType( + ctx.Input("Variance")->dtype()), + phi::errors::InvalidArgument("Variance input should be of float type")); return phi::KernelKey(input_data_type, ctx.GetPlace()); } @@ -254,11 +254,11 @@ void BatchNormOpMaker::Make() { PADDLE_ENFORCE_GE( epsilon, 0.0f, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'epsilon' should be greater or equal than 0.0.")); PADDLE_ENFORCE_LE(epsilon, 0.001f, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'epsilon' should be less or equal than 0.001.")); }); AddAttr("data_layout", "").SetDefault("NCHW"); @@ -349,7 +349,7 @@ void BatchNormGradOp::InferShape(framework::InferShapeContext *ctx) const { PADDLE_ENFORCE_EQ((has_scale_grad == has_bias_grad), true, - platform::errors::NotFound( + phi::errors::NotFound( "Output(Scale@GRAD) and Output(Bias@GRAD) must be null " "or not be null at same time. 
But now, " "has Scale@Grad=[%d], has Bias@GRAD=[%d]", @@ -361,7 +361,7 @@ void BatchNormGradOp::InferShape(framework::InferShapeContext *ctx) const { PADDLE_ENFORCE_EQ( !ctx->Attrs().Get("use_mkldnn"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Using global stats during training is not supported " "in oneDNN version of batch_norm_gradient kernel now.")); } @@ -391,7 +391,7 @@ phi::KernelKey BatchNormGradOp::GetExpectedKernelType( const auto *var = ctx.InputVar(framework::GradVarName("Y")); if (var == nullptr) { PADDLE_THROW( - platform::errors::InvalidArgument("can't find gradient variable of Y")); + phi::errors::InvalidArgument("can't find gradient variable of Y")); } const phi::DenseTensor *t = nullptr; if (var->IsType()) { @@ -399,7 +399,7 @@ phi::KernelKey BatchNormGradOp::GetExpectedKernelType( } if (t == nullptr) { PADDLE_THROW( - platform::errors::InvalidArgument("gradient variable of Y is empty")); + phi::errors::InvalidArgument("gradient variable of Y is empty")); } auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); @@ -532,8 +532,7 @@ phi::KernelKey BatchNormDoubleGradOp::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { const auto *var = ctx.InputVar("DY"); if (var == nullptr) { - PADDLE_THROW( - platform::errors::NotFound("cannot find gradient variable of Y")); + PADDLE_THROW(phi::errors::NotFound("cannot find gradient variable of Y")); } const phi::DenseTensor *t = nullptr; if (var->IsType()) { @@ -541,7 +540,7 @@ phi::KernelKey BatchNormDoubleGradOp::GetExpectedKernelType( } if (t == nullptr) { PADDLE_THROW( - platform::errors::InvalidArgument("gradient variable of Y is empty")); + phi::errors::InvalidArgument("gradient variable of Y is empty")); } return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index cc013bd0b406e..9d48d7858f41a 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -26,8 +26,8 @@ namespace cub = hipcub; #include "paddle/common/flags.h" #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/batch_norm_op.h" -#include "paddle/fluid/platform/float16.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/kernels/funcs/math_function.h" COMMON_DECLARE_bool(cudnn_batchnorm_spatial_persistent); diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index ec3ced614bd92..50a69e6390302 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -114,6 +114,6 @@ PD_REGISTER_STRUCT_KERNEL(beam_search_decode, ops::BeamSearchDecodeOpKernel, float, double, - paddle::platform::float16, + phi::dtype::float16, int, int64_t) {} diff --git a/paddle/fluid/operators/beam_search_decode_op.cu.cc b/paddle/fluid/operators/beam_search_decode_op.cu.cc index bab5423c99b05..beeb13725c6b1 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cu.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cu.cc @@ -23,6 +23,6 @@ PD_REGISTER_STRUCT_KERNEL(beam_search_decode, ops::BeamSearchDecodeOpKernel, float, double, - paddle::platform::float16, + phi::dtype::float16, int, int64_t) {} diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h index 7347e228780b4..99735e98276e6 100644 --- 
a/paddle/fluid/operators/beam_search_decode_op.h +++ b/paddle/fluid/operators/beam_search_decode_op.h @@ -85,7 +85,7 @@ struct BeamSearchDecodeFunctor { template void apply_mix() const { if (std::is_same::value) { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "beam search decode op does not support bool!")); } else { @@ -125,7 +125,7 @@ class BeamSearchDecodeOpKernel : public framework::OpKernel { PADDLE_ENFORCE_GT( step_num, 0UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "beam search steps, which is the" "size of Input(Ids) LoDTensorArray. beam search steps should " "be larger than 0, but received %d. ", @@ -134,7 +134,7 @@ class BeamSearchDecodeOpKernel : public framework::OpKernel { PADDLE_ENFORCE_GT( source_num, 0UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "source_num is the sequence number of the" "first decoding step, indicating by Input(Ids)[0].lod[0].size. " "The number of source_num should be larger than" @@ -145,7 +145,7 @@ class BeamSearchDecodeOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( ids->at(i).lod().size(), 2UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "For the i step in beam search steps," "the size of Input(Ids)[i].lod() should larger than 2," "but received %d. ", diff --git a/paddle/fluid/operators/beam_search_decode_op_def.h b/paddle/fluid/operators/beam_search_decode_op_def.h index d358d8255fcf3..ff16e093e0bf5 100644 --- a/paddle/fluid/operators/beam_search_decode_op_def.h +++ b/paddle/fluid/operators/beam_search_decode_op_def.h @@ -90,7 +90,7 @@ void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( PADDLE_ENFORCE_NE( src_num, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "src_num is the sequence number of the first decoding step" ", indicating by Input(Ids)[0].lod[0].size." "src_num has wrong value." @@ -162,12 +162,12 @@ void BeamSearchDecoder::Backtrace(const LoDTensorArray& step_ids, PADDLE_ENFORCE_NE( step_ids.empty(), true, - platform::errors::InvalidArgument("Input(Ids) should not be empty." - "But the Input(Ids) is empty.")); + phi::errors::InvalidArgument("Input(Ids) should not be empty." + "But the Input(Ids) is empty.")); PADDLE_ENFORCE_EQ( step_ids.size(), step_scores.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of Input(Ids) and Input(Scores) should be " "the same. But the size of Input(Ids) and Input(Scores) " "are not equal.")); diff --git a/paddle/fluid/operators/beam_search_decode_op_xpu.cc b/paddle/fluid/operators/beam_search_decode_op_xpu.cc index 5fd2b2fc6fa35..c438070ce07f9 100644 --- a/paddle/fluid/operators/beam_search_decode_op_xpu.cc +++ b/paddle/fluid/operators/beam_search_decode_op_xpu.cc @@ -30,7 +30,7 @@ class BeamSearchDecodeXPUKernel : public framework::OpKernel { PADDLE_ENFORCE_GT( step_num, 0UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "beam search steps, which is the" "size of Input(Ids) LoDTensorArray. beam search steps should " "be larger than 0, but received %d. ", @@ -40,7 +40,7 @@ class BeamSearchDecodeXPUKernel : public framework::OpKernel { PADDLE_ENFORCE_GT( source_num, 0UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "source_num is the sequence number of the" "first decoding step, indicating by Input(Ids)[0].lod[0].size. 
" "The number of source_num should be larger than" @@ -51,7 +51,7 @@ class BeamSearchDecodeXPUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( ids->at(i).lod().size(), 2UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "For the i step in beam search steps," "the size of Input(Ids)[i].lod() should larger than 2," "but received %d. ", @@ -91,7 +91,7 @@ class BeamSearchDecodeXPUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( r, xpu::Error_t::SUCCESS, - platform::errors::External( + phi::errors::External( "Execute function CopyTensorByXPU failed by [%d]", r)); r = CopyTensorByType( @@ -99,7 +99,7 @@ class BeamSearchDecodeXPUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( r, xpu::Error_t::SUCCESS, - platform::errors::External( + phi::errors::External( "Execute function CopyTensorByXPU failed by [%d]", r)); sentenceIds_temp->set_lod(sentenceIds->lod()); sentenceScores_temp->set_lod(sentenceScores->lod()); @@ -119,7 +119,7 @@ PD_REGISTER_STRUCT_KERNEL(beam_search_decode, ops::BeamSearchDecodeXPUKernel, float, double, - plat::float16, + phi::dtype::float16, int, int64_t) {} #endif diff --git a/paddle/fluid/operators/beam_search_decode_op_xpu.h b/paddle/fluid/operators/beam_search_decode_op_xpu.h index 5e63627c6f88c..863b92e9f2b7d 100644 --- a/paddle/fluid/operators/beam_search_decode_op_xpu.h +++ b/paddle/fluid/operators/beam_search_decode_op_xpu.h @@ -45,7 +45,7 @@ int CopyTensorByXPU(const phi::DenseTensor& srcTensor, PADDLE_ENFORCE_EQ( r, xpu::Error_t::SUCCESS, - platform::errors::External("Execute function SetMeta failed by [%d]", r)); + phi::errors::External("Execute function SetMeta failed by [%d]", r)); if (flag == 0) { T* dstData = @@ -75,8 +75,7 @@ const int CopyTensorByType(const phi::DenseTensor& srcTensor, if (srcTensor.dtype() == phi::DataType::FLOAT32) r = CopyTensorByXPU(srcTensor, dstTensor, flag, place); else if (srcTensor.dtype() == phi::DataType::FLOAT16) - r = CopyTensorByXPU( - srcTensor, dstTensor, flag, place); + r = CopyTensorByXPU(srcTensor, dstTensor, flag, place); else if (srcTensor.dtype() == phi::DataType::FLOAT64) r = CopyTensorByXPU(srcTensor, dstTensor, flag, place); else if (srcTensor.dtype() == phi::DataType::INT32) @@ -88,7 +87,7 @@ const int CopyTensorByType(const phi::DenseTensor& srcTensor, PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::External( + phi::errors::External( "Execute function CopyTensorByXPU failed by [%d]", r)); return xpu::Error_t::SUCCESS; @@ -117,7 +116,7 @@ struct BeamSearchDecodeXPUFunctor { PADDLE_ENFORCE_EQ( r, xpu::Error_t::SUCCESS, - platform::errors::External( + phi::errors::External( "Execute function CopyTensorByXPU failed by [%d]", r)); } @@ -135,7 +134,7 @@ struct BeamSearchDecodeXPUFunctor { PADDLE_ENFORCE_EQ( r, xpu::Error_t::SUCCESS, - platform::errors::External( + phi::errors::External( "Execute function CopyTensorByType failed by [%d]", r)); } @@ -148,7 +147,7 @@ struct BeamSearchDecodeXPUFunctor { template void apply_xpu() const { if (std::is_same::value) { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "beam search decode op does not support bool!")); } else { BeamSearchDecoder beam_search_decoder(beam_size_, end_id_); diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h index fea706bb54a93..0beeb0cc407fe 100644 --- a/paddle/fluid/operators/beam_search_op.h +++ b/paddle/fluid/operators/beam_search_op.h @@ -29,16 +29,15 @@ class BeamSearchOpKernel : public 
framework::OpKernel { auto* pre_ids = context.Input("pre_ids"); auto* pre_scores = context.Input("pre_scores"); - PADDLE_ENFORCE_NOT_NULL(scores, - platform::errors::NotFound( - "Input(scores) of BeamSearchOp is not found.")); + PADDLE_ENFORCE_NOT_NULL( + scores, + phi::errors::NotFound("Input(scores) of BeamSearchOp is not found.")); PADDLE_ENFORCE_NOT_NULL( pre_ids, - platform::errors::NotFound( - "Input(pre_ids) of BeamSearchOp is not found.")); + phi::errors::NotFound("Input(pre_ids) of BeamSearchOp is not found.")); PADDLE_ENFORCE_NOT_NULL( pre_scores, - platform::errors::NotFound( + phi::errors::NotFound( "Input(pre_scores) of BeamSearchOp is not found.")); size_t level = context.Attr("level"); @@ -51,11 +50,11 @@ class BeamSearchOpKernel : public framework::OpKernel { auto* parent_idx = context.Output("parent_idx"); PADDLE_ENFORCE_NOT_NULL( selected_ids, - platform::errors::NotFound( + phi::errors::NotFound( "Output(selected_ids) of BeamSearchOp is not found.")); PADDLE_ENFORCE_NOT_NULL( selected_scores, - platform::errors::NotFound( + phi::errors::NotFound( "Output(selected_scores) of BeamSearchOp is not found.")); math::BeamSearchFunctor alg; diff --git a/paddle/fluid/operators/bernoulli_op.h b/paddle/fluid/operators/bernoulli_op.h index ffa2722ccbb60..f5ca225a49d26 100644 --- a/paddle/fluid/operators/bernoulli_op.h +++ b/paddle/fluid/operators/bernoulli_op.h @@ -25,14 +25,14 @@ namespace operators { template inline HOSTDEVICE T BernoulliFunctor(T p, T rand) { - PADDLE_ENFORCE_LE(p, - 1.0, - platform::errors::OutOfRange( - "The probability should be <= 1, but got %f", p)); - PADDLE_ENFORCE_GE(p, - 0.0, - platform::errors::OutOfRange( - "The probability should be >= 0, but got %f", p)); + PADDLE_ENFORCE_LE( + p, + 1.0, + phi::errors::OutOfRange("The probability should be <= 1, but got %f", p)); + PADDLE_ENFORCE_GE( + p, + 0.0, + phi::errors::OutOfRange("The probability should be >= 0, but got %f", p)); return static_cast(rand < p); } diff --git a/paddle/fluid/operators/bilateral_slice_op.cc b/paddle/fluid/operators/bilateral_slice_op.cc index 111f128fc3cc6..1b4624e3594f7 100644 --- a/paddle/fluid/operators/bilateral_slice_op.cc +++ b/paddle/fluid/operators/bilateral_slice_op.cc @@ -37,7 +37,7 @@ class BilateralSliceOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( dim_x.size(), 4, - platform::errors::Unimplemented( + phi::errors::Unimplemented( "Input(X) dimension must be 4, but got dimension = %d .", dim_x.size())); @@ -58,7 +58,7 @@ class BilateralSliceOp : public framework::OperatorWithKernel { if (has_offset) { PADDLE_ENFORCE_EQ((coeffs_chans % (input_chans + 1)), 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Slicing with affine offset, coefficients grid " "should have n_out*(n_in+1) channels, but got %d", coeffs_chans)); @@ -67,7 +67,7 @@ class BilateralSliceOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( (coeffs_chans % input_chans), 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Slicing without affine offset, coefficients grid " "should have n_out*n_in channels, but got %d .", coeffs_chans)); @@ -179,10 +179,10 @@ template class BilateralSliceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), - true, - platform::errors::Unimplemented( - "BilateralSlice only supports GPU now.")); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), + true, + 
phi::errors::Unimplemented("BilateralSlice only supports GPU now.")); } }; diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc deleted file mode 100644 index c628bad0aa3c0..0000000000000 --- a/paddle/fluid/operators/bpr_loss_op.cc +++ /dev/null @@ -1,192 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/bpr_loss_op.h" - -#include - -namespace paddle { -namespace operators { - -class BprLossOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "BprLoss"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "BprLoss"); - OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "BprLoss"); - - auto x_dims = ctx->GetInputDim("X"); - auto label_dims = ctx->GetInputDim("Label"); - int rank = x_dims.size(); - PADDLE_ENFORCE_EQ( - rank, - label_dims.size(), - platform::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same rank.")); - - if (ctx->IsRuntime() || - (common::product(x_dims) > 0 && common::product(label_dims) > 0)) { - PADDLE_ENFORCE_EQ( - common::slice_ddim(x_dims, 0, rank - 1), - common::slice_ddim(label_dims, 0, rank - 1), - platform::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same shape " - "except the last dimension.")); - } - - auto y_dims = x_dims; - y_dims[rank - 1] = 1; - ctx->SetOutputDim("Y", y_dims); - ctx->ShareLoD("X", /*->*/ "Y"); - } - - protected: - // Explicitly set that the data type of computation kernel of Seq-bpr - // is determined by its input "X". 
- phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - platform::CPUPlace()); - } -}; - -class BprLossGradientOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "BprLossGradient"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "BprLossGradient"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), - "Input", - framework::GradVarName("Y"), - "BprLossGradient"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), - "Output", - framework::GradVarName("X"), - "BprLossGradient"); - - auto x_dims = ctx->GetInputDim("X"); - auto label_dims = ctx->GetInputDim("Label"); - auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y")); - int rank = x_dims.size(); - PADDLE_ENFORCE_EQ( - dy_dims.size(), - rank, - platform::errors::InvalidArgument( - "Input(Y@Grad) and Input(X) should have the same rank.")); - PADDLE_ENFORCE_EQ( - label_dims.size(), - rank, - platform::errors::InvalidArgument( - "Input(Label) and Input(X) should have the same rank.")); - PADDLE_ENFORCE_EQ(common::slice_ddim(x_dims, 0, rank - 1), - common::slice_ddim(label_dims, 0, rank - 1), - platform::errors::InvalidArgument( - "The Input(X) and Input(Label) should have the same " - "shape except the last dimension.")); - PADDLE_ENFORCE_EQ(common::slice_ddim(x_dims, 0, rank - 1), - common::slice_ddim(dy_dims, 0, rank - 1), - platform::errors::InvalidArgument( - "The Input(X) and Input(Y@Grad) should have the same " - "shape except the last dimension.")); - PADDLE_ENFORCE_EQ(dy_dims[rank - 1], - 1, - platform::errors::InvalidArgument( - "The last dimension of Input(Y@Grad) should be 1.")); - PADDLE_ENFORCE_EQ(label_dims[rank - 1], - 1, - platform::errors::InvalidArgument( - " the last dimension of Input(Label) should be 1.")); - ctx->SetOutputDim(framework::GradVarName("X"), x_dims); - ctx->ShareLoD("X", framework::GradVarName("X")); - } - - protected: - // Explicitly set that the data type of computation kernel of cross_entropy - // is determined by its input "X". - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - platform::CPUPlace()); - } -}; - -class BprLossOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(Tensor, default Tensor), a tensor whose last dimension " - "size is equal to the number of classes. This input is a " - "real number."); - AddInput( - "Label", - "(Tensor), the tensor which represents the ground truth. It has the " - "same shape with 'X' except the last dimension. the last dimension " - "size is 1."); - AddOutput("Y", - "(Tensor, default Tensor), a tensor whose shape is same " - "with 'X' except that the last dimension size is 1. It " - "represents the sequence bpr loss."); - AddComment(R"DOC( -Bayesian Personalized Ranking Loss Operator. - -This operator belongs to pairwise ranking loss. Label is the desired item. 
-The loss at a given point in one session is defined as: -$Y[i] = -\frac{1}{N_{i}} * \sum_{j=0}^{N_{i}}\log(\sigma(X[i, Label[i]]-X[i, j]))$ - -For more details, see the paper: https://arxiv.org/abs/1511.06939 - -)DOC"); - } -}; - -template <typename T> -class BprLossGradMaker : public framework::SingleGradOpMaker<T> { - public: - using framework::SingleGradOpMaker<T>::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr<T> op) const override { - op->SetType("bpr_loss_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("Label", this->Input("Label")); - op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetAttrMap(this->Attrs()); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(bpr_loss, - ops::BprLossOp, - ops::BprLossOpMaker, - ops::BprLossGradMaker<paddle::framework::OpDesc>, - ops::BprLossGradMaker<paddle::imperative::OpBase>); -REGISTER_OPERATOR(bpr_loss_grad, ops::BprLossGradientOp); - -PD_REGISTER_STRUCT_KERNEL( - bpr_loss, CPU, ALL_LAYOUT, ops::BprLossOpKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL(bpr_loss_grad, - CPU, - ALL_LAYOUT, - ops::BprLossGradientOpKernel, - float, - double) {} diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h deleted file mode 100644 index 1a1bddc17c2e5..0000000000000 --- a/paddle/fluid/operators/bpr_loss_op.h +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -/* TODO: - * Find a way to adapt TolerableValue using BLAS or Eigen.
- */ -template -struct TolerableValue { - HOSTDEVICE T operator()(const T& x) const { - const T kApproInf = 1e20; - if (x == INFINITY) return kApproInf; - if (x == -INFINITY) return -kApproInf; - return x; - } -}; - -template -class BprLossOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* label = ctx.Input("Label"); - auto* y = ctx.Output("Y"); - y->mutable_data(ctx.GetPlace()); - int rank = x->dims().size(); - - phi::DenseTensor x_2d = phi::ReshapeToMatrix(*x, rank - 1); - phi::DenseTensor labels_2d = phi::ReshapeToMatrix(*label, rank - 1); - phi::DenseTensor y_2d = phi::ReshapeToMatrix(*y, rank - 1); - - const phi::DenseTensor* logits = &x_2d; - const phi::DenseTensor* labels = &labels_2d; - phi::DenseTensor* out = &y_2d; - - const int step_size = logits->dims()[0]; - const int class_num = logits->dims()[1]; - const T* logits_data = logits->data(); - T* loss_data = out->data(); - - const int64_t* label_data = labels->data(); - for (int i = 0; i < step_size; ++i) { - int lbl_pos = label_data[i]; - PADDLE_ENFORCE_GE(lbl_pos, - 0, - platform::errors::InvalidArgument( - "label data %d is illegal.", lbl_pos)); - PADDLE_ENFORCE_LT(lbl_pos, - class_num, - platform::errors::InvalidArgument( - "label data %d is illegal.", lbl_pos)); - int index_pos = i * class_num + lbl_pos; - T sum = static_cast(0); - for (int j = 0; j < class_num; j++) { - if (j == lbl_pos) continue; - int index_neg = i * class_num + j; - sum += TolerableValue()(-std::log( - 1.0f + TolerableValue()(std::exp(logits_data[index_neg] - - logits_data[index_pos])))); - } - loss_data[i] = -sum / (class_num - 1); - } - } -}; - -template -class BprLossGradientOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dy = ctx.Input(framework::GradVarName("Y")); - auto* label = ctx.Input("Label"); - auto* dx = ctx.Output(framework::GradVarName("X")); - - const size_t step_size = static_cast(x->dims()[0]); - const size_t num_classes = static_cast(x->dims()[1]); - T* dx_data = dx->mutable_data(ctx.GetPlace()); - const T* dy_data = dy->data(); - const T* x_data = x->data(); - const int64_t* label_data = label->data(); - - for (size_t sample_id = 0; sample_id < step_size; sample_id++) { - for (size_t x_offset = sample_id * num_classes; - x_offset < (sample_id + 1) * num_classes; - x_offset++) { - dx_data[x_offset] = static_cast(0); - } - auto p_index = sample_id * num_classes + label_data[sample_id]; - for (size_t ni = 0; ni < num_classes; ni++) { - if (label_data[sample_id] == static_cast(ni)) continue; - auto n_index = sample_id * num_classes + ni; - auto grad_ = -dy_data[sample_id] / - ((num_classes - 1) * - (1.0f + TolerableValue()(std::exp(x_data[p_index] - - x_data[n_index])))); - dx_data[p_index] += grad_; - dx_data[n_index] -= grad_; - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index b83ebc6f899ef..6b587aba3dbf1 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -17,7 +17,7 @@ limitations under the License. 
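Taken together, the two deleted bpr_loss files above compute, for each sample, the negative mean of log(sigmoid(x_pos - x_neg)) over all negative classes, which is exactly what the DOC formula expresses. As a reading aid, here is a minimal standalone sketch of that forward pass; the function name and plain-vector interface are illustrative only, not part of the Paddle API, and the TolerableValue clamping is omitted for brevity.

#include <cmath>
#include <cstdint>
#include <vector>

// Reference forward pass of bpr_loss for row-major logits of shape
// [batch, num_classes]; labels[i] holds the positive class of sample i.
std::vector<float> BprLossReference(const std::vector<float>& logits,
                                    const std::vector<int64_t>& labels,
                                    int num_classes) {
  const int batch = static_cast<int>(labels.size());
  std::vector<float> loss(batch, 0.0f);
  for (int i = 0; i < batch; ++i) {
    const int pos_idx = static_cast<int>(labels[i]);
    const float pos = logits[i * num_classes + pos_idx];
    float sum = 0.0f;
    for (int j = 0; j < num_classes; ++j) {
      if (j == pos_idx) continue;
      // -log(1 + exp(neg - pos)) equals log(sigmoid(pos - neg))
      sum += -std::log(1.0f + std::exp(logits[i * num_classes + j] - pos));
    }
    // negative mean log-sigmoid margin over the num_classes - 1 negatives
    loss[i] = -sum / static_cast<float>(num_classes - 1);
  }
  return loss;
}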
*/ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/unary.h" @@ -94,7 +94,7 @@ class CastOp : public framework::OperatorWithKernel { auto *tensor = ctx.Input("X"); PADDLE_ENFORCE_EQ(tensor->IsInitialized(), true, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The tensor of Input(X) is not initialized.")); auto &tensor_place = tensor->place(); // NOTE: cuda pinned tensor need to copy its data to target place diff --git a/paddle/fluid/operators/chunk_eval_op.cc b/paddle/fluid/operators/chunk_eval_op.cc deleted file mode 100644 index 5e95d0cdda3f8..0000000000000 --- a/paddle/fluid/operators/chunk_eval_op.cc +++ /dev/null @@ -1,202 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/chunk_eval_op.h" - -#include -#include - -namespace paddle { -namespace operators { - -class ChunkEvalOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("Inference"), "Input", "Inference", "chunk_eval"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "chunk_eval"); - - OP_INOUT_CHECK( - ctx->HasOutput("Precision"), "Output", "Precision", "chunk_eval"); - OP_INOUT_CHECK(ctx->HasOutput("Recall"), "Output", "Recall", "chunk_eval"); - OP_INOUT_CHECK( - ctx->HasOutput("F1-Score"), "Output", "F1-Score", "chunk_eval"); - OP_INOUT_CHECK(ctx->HasOutput("NumInferChunks"), - "Output", - "NumInferChunks", - "chunk_eval"); - OP_INOUT_CHECK(ctx->HasOutput("NumLabelChunks"), - "Output", - "NumLabelChunks", - "chunk_eval"); - OP_INOUT_CHECK(ctx->HasOutput("NumCorrectChunks"), - "Output", - "NumCorrectChunks", - "chunk_eval"); - - auto inference_dim = ctx->GetInputDim("Inference"); - auto label_dim = ctx->GetInputDim("Label"); - - PADDLE_ENFORCE_EQ( - inference_dim, - label_dim, - platform::errors::InvalidArgument( - "Input(Inference)'s shape must be the same as Input(Label)'s " - "shape, but received [%s] (Inference) vs [%s] (Label).", - inference_dim, - label_dim)); - - bool use_padding = ctx->HasInput("SeqLength"); - if (use_padding) { - PADDLE_ENFORCE_EQ( - (inference_dim.size() == 3 && inference_dim[2] == 1) || - inference_dim.size() == 2, - true, - platform::errors::InvalidArgument( - "when Input(SeqLength) is provided, Input(Inference) " - "should be of dim 3 (batch_size, bucket, 1) or dim 2 " - "(batch_size, bucket), but received [%s].", - inference_dim)); - auto seq_length_dim = ctx->GetInputDim("SeqLength"); - PADDLE_ENFORCE_LE(seq_length_dim.size(), - 2, - platform::errors::InvalidArgument( - "Input(SeqLength)'s rank should not be greater " - "than 2, but received %d.", - 
seq_length_dim.size())); - } - - ctx->SetOutputDim("Precision", {1}); - ctx->SetOutputDim("Recall", {1}); - ctx->SetOutputDim("F1-Score", {1}); - ctx->SetOutputDim("NumInferChunks", {1}); - ctx->SetOutputDim("NumLabelChunks", {1}); - ctx->SetOutputDim("NumCorrectChunks", {1}); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey(framework::proto::VarType::FP32, - platform::CPUPlace()); - } -}; - -class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Inference", - "(Tensor, default: Tensor<int64_t>). " - "Predictions from the network."); - AddInput("Label", - "(Tensor, default: Tensor<int64_t>). The true tag sequences."); - AddInput("SeqLength", - "(Tensor, default: Tensor<int64_t>). The length of each sequence, " - "used when Inference and Label are of Tensor type.") - .AsDispensable(); - AddOutput("Precision", - "(float). The evaluated precision (called positive predictive " - "value) of chunks on the given mini-batch."); - AddOutput("Recall", - "(float). The evaluated recall (true positive rate or " - "sensitivity) of chunks on the given mini-batch."); - AddOutput("F1-Score", - "(float). The evaluated F1-Score on the given mini-batch."); - AddOutput("NumInferChunks", - "(int64_t). The number of chunks in Inference on the given " - "mini-batch."); - AddOutput( - "NumLabelChunks", - "(int64_t). The number of chunks in Label on the given mini-batch."); - AddOutput( - "NumCorrectChunks", - "(int64_t). The number of chunks both in Inference and Label on the " - "given mini-batch."); - AddAttr<int>("num_chunk_types", - "The number of chunk types. See the description for details."); - AddAttr<std::string>("chunk_scheme", - "The labeling scheme indicating " - "how to encode the chunks. Must be IOB, IOE, IOBES or " - "plain. See the description " - "for details.") - .SetDefault("IOB"); - AddAttr<std::vector<int>>("excluded_chunk_types", - "A list including chunk type ids " - "indicating chunk types that are not counted. " - "See the description for details.") - .SetDefault(std::vector<int>{}); - AddComment(R"DOC( -For some basics of chunking, please refer to -'Chunking with Support Vector Machines'. - -ChunkEvalOp computes the precision, recall, and F1-score of chunk detection, -and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. -Here is a NER example of labeling for these tagging schemes: - - Li Ming works at Agricultural Bank of China in Beijing. - IO I-PER I-PER O O I-ORG I-ORG I-ORG I-ORG O I-LOC - IOB B-PER I-PER O O B-ORG I-ORG I-ORG I-ORG O B-LOC - IOE I-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O E-LOC - IOBES B-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O S-LOC - -There are three chunk types (named entity types) including PER (person), ORG (organization) -and LOC (location), and we can see that the labels have the form -<tag type>-<chunk type>. - -Since the calculations actually use label ids rather than labels, extra attention -should be paid when mapping labels to ids to make ChunkEvalOp work. The key point -is that the listed equations are satisfied by ids. - - tag_type = label % num_tag_type - chunk_type = label / num_tag_type - -where `num_tag_type` is the number of tag types in the tagging scheme, `num_chunk_type` -is the number of chunk types, and `tag_type` gets its value from the following table. - - Scheme Begin Inside End Single - plain 0 - - - - IOB 0 1 - - - IOE - 0 1 - - IOBES 0 1 2 3 - -Still using NER as an example, assuming the tagging scheme is IOB and the chunk types are ORG, -PER and LOC.
To satisfy the above equations, the label map can be like this: - - B-ORG 0 - I-ORG 1 - B-PER 2 - I-PER 3 - B-LOC 4 - I-LOC 5 - O 6 - -It's not hard to verify the equations noting that the num of chunk types -is 3 and the num of tag types in IOB scheme is 2. For example, the label -id of I-LOC is 5, the tag type id of I-LOC is 1, and the chunk type id of -I-LOC is 2, which consistent with the results from the equations. -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(chunk_eval, - ops::ChunkEvalOp, - ops::ChunkEvalOpMaker); - -PD_REGISTER_STRUCT_KERNEL( - chunk_eval, CPU, ALL_LAYOUT, ops::ChunkEvalKernel, float) {} diff --git a/paddle/fluid/operators/chunk_eval_op.h b/paddle/fluid/operators/chunk_eval_op.h deleted file mode 100644 index baad8719db37f..0000000000000 --- a/paddle/fluid/operators/chunk_eval_op.h +++ /dev/null @@ -1,358 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class ChunkEvalKernel : public framework::OpKernel { - public: - struct Segment { - int begin; - int end; - int type; - bool operator==(const Segment& y) const { - return begin == y.begin && end == y.end && type == y.type; - } - }; - - void GetSegments(const int64_t* label, - int length, - std::vector* segments, - int num_chunk_types, - int num_tag_types, - int other_chunk_type, - int tag_begin, - int tag_inside, - int tag_end, - int tag_single) const { - segments->clear(); - segments->reserve(length); - int chunk_start = 0; - bool in_chunk = false; - int tag = -1; - int type = other_chunk_type; - for (int i = 0; i < length; ++i) { - int prev_tag = tag; - int prev_type = type; - PADDLE_ENFORCE_LE( - label[i], - num_chunk_types * num_tag_types, - platform::errors::InvalidArgument( - "The value of Input(Label) should be less than the number of " - "chunk types times the number of tag types, but received %d " - "(Label) vs %d (chunk types) * %d (tag types).", - label[i], - num_chunk_types, - num_tag_types)); - tag = label[i] % num_tag_types; - type = label[i] / num_tag_types; - if (in_chunk && ChunkEnd(prev_tag, - prev_type, - tag, - type, - other_chunk_type, - tag_begin, - tag_inside, - tag_end, - tag_single)) { - Segment segment{ - chunk_start, // begin - i - 1, // end - prev_type, - }; - segments->push_back(segment); - in_chunk = false; - } - if (ChunkBegin(prev_tag, - prev_type, - tag, - type, - other_chunk_type, - tag_begin, - tag_inside, - tag_end, - tag_single)) { - chunk_start = i; - in_chunk = true; - } - } - if (in_chunk) { - Segment segment{ - chunk_start, // begin - length - 1, // end - type, - }; - segments->push_back(segment); - } - } - - bool ChunkEnd(int prev_tag, - int prev_type, - int tag, - int type, - int other_chunk_type, - int tag_begin, - int tag_inside, - int 
tag_end, - int tag_single) const { - if (prev_type == other_chunk_type) return false; - if (type == other_chunk_type) return true; - if (type != prev_type) return true; - if (prev_tag == tag_begin) return tag == tag_begin || tag == tag_single; - if (prev_tag == tag_inside) return tag == tag_begin || tag == tag_single; - if (prev_tag == tag_end) return true; - if (prev_tag == tag_single) return true; - return false; - } - - bool ChunkBegin(int prev_tag, - int prev_type, - int tag, - int type, - int other_chunk_type, - int tag_begin, - int tag_inside, - int tag_end, - int tag_single) const { - if (prev_type == other_chunk_type) return type != other_chunk_type; - if (type == other_chunk_type) return false; - if (type != prev_type) return true; - if (tag == tag_begin) return true; - if (tag == tag_inside) return prev_tag == tag_end || prev_tag == tag_single; - if (tag == tag_end) return prev_tag == tag_end || prev_tag == tag_single; - if (tag == tag_single) return true; - return false; - } - - void Compute(const framework::ExecutionContext& context) const override { - // initialize to parse configurations - int num_chunk_types, num_tag_types; - int other_chunk_type; - int tag_begin, tag_inside, tag_end, tag_single; - std::vector label_segments; - std::vector output_segments; - std::set excluded_chunk_types; - - if (context.Attr("chunk_scheme") == "IOB") { - num_tag_types = 2; - tag_begin = 0; - tag_inside = 1; - tag_end = -1; - tag_single = -1; - } else if (context.Attr("chunk_scheme") == "IOE") { - num_tag_types = 2; - tag_begin = -1; - tag_inside = 0; - tag_end = 1; - tag_single = -1; - } else if (context.Attr("chunk_scheme") == "IOBES") { - num_tag_types = 4; - tag_begin = 0; - tag_inside = 1; - tag_end = 2; - tag_single = 3; - } else if (context.Attr("chunk_scheme") == "plain") { - num_tag_types = 1; - tag_begin = -1; - tag_inside = -1; - tag_end = -1; - tag_single = -1; - } else { - PADDLE_THROW(platform::errors::InvalidArgument("Unknown chunk scheme.")); - } - other_chunk_type = num_chunk_types = context.Attr("num_chunk_types"); - excluded_chunk_types.insert( - context.Attr>("excluded_chunk_types").begin(), - context.Attr>("excluded_chunk_types").end()); - - auto* inference = context.Input("Inference"); - auto place = inference->place(); - auto* label = context.Input("Label"); - auto* precision = context.Output("Precision"); - auto* recall = context.Output("Recall"); - auto* f1 = context.Output("F1-Score"); - auto* num_infer_chunks = context.Output("NumInferChunks"); - auto* num_label_chunks = context.Output("NumLabelChunks"); - auto* num_correct_chunks = - context.Output("NumCorrectChunks"); - - const int64_t* inference_data = inference->data(); - const int64_t* label_data = label->data(); - T* precision_data = precision->mutable_data(place); - T* recall_data = recall->mutable_data(place); - T* f1_data = f1->mutable_data(place); - int64_t* num_infer_chunks_data = - num_infer_chunks->mutable_data(place); - int64_t* num_label_chunks_data = - num_label_chunks->mutable_data(place); - int64_t* num_correct_chunks_data = - num_correct_chunks->mutable_data(place); - *num_infer_chunks_data = 0; - *num_label_chunks_data = 0; - *num_correct_chunks_data = 0; - - auto lod = label->lod(); - bool use_padding = lod.empty(); - int num_sequences = 0; - - if (use_padding) { - auto dim1 = inference->dims()[1]; - auto* seq_length_t = context.Input("SeqLength"); - auto* seq_length_data = seq_length_t->data(); - num_sequences = seq_length_t->dims()[0]; - - for (int i = 0; i < num_sequences; ++i) { - int 
seq_length = seq_length_data[i]; - EvalOneSeq(inference_data + i * dim1, - label_data + i * dim1, - seq_length, - &output_segments, - &label_segments, - num_infer_chunks_data, - num_label_chunks_data, - num_correct_chunks_data, - num_chunk_types, - num_tag_types, - other_chunk_type, - tag_begin, - tag_inside, - tag_end, - tag_single, - excluded_chunk_types); - } - } else { - PADDLE_ENFORCE_EQ( - lod.size(), - 1UL, - platform::errors::InvalidArgument( - "Only support one level LoD sequence now, but received %d.", - lod.size())); - PADDLE_ENFORCE_EQ( - lod, - inference->lod(), - platform::errors::InvalidArgument( - "Input(Inference) and Input(Label) of Op(chunk_eval) should have " - "same LoD information.")); - num_sequences = lod[0].size() - 1; - - for (int i = 0; i < num_sequences; ++i) { - int seq_length = lod[0][i + 1] - lod[0][i]; - EvalOneSeq(inference_data + lod[0][i], - label_data + lod[0][i], - seq_length, - &output_segments, - &label_segments, - num_infer_chunks_data, - num_label_chunks_data, - num_correct_chunks_data, - num_chunk_types, - num_tag_types, - other_chunk_type, - tag_begin, - tag_inside, - tag_end, - tag_single, - excluded_chunk_types); - } - } - - *precision_data = !(*num_infer_chunks_data) - ? 0 - : static_cast(*num_correct_chunks_data) / - (*num_infer_chunks_data); - *recall_data = !(*num_label_chunks_data) - ? 0 - : static_cast(*num_correct_chunks_data) / - (*num_label_chunks_data); - *f1_data = !(*num_correct_chunks_data) - ? 0 - : 2 * (*precision_data) * (*recall_data) / - ((*precision_data) + (*recall_data)); - } - - void EvalOneSeq(const int64_t* output, - const int64_t* label, - int length, - std::vector* output_segments, - std::vector* label_segments, - int64_t* num_output_segments, - int64_t* num_label_segments, - int64_t* num_correct, - int num_chunk_types, - int num_tag_types, - int other_chunk_type, - int tag_begin, - int tag_inside, - int tag_end, - int tag_single, - const std::set& excluded_chunk_types) const { - GetSegments(output, - length, - output_segments, - num_chunk_types, - num_tag_types, - other_chunk_type, - tag_begin, - tag_inside, - tag_end, - tag_single); - GetSegments(label, - length, - label_segments, - num_chunk_types, - num_tag_types, - other_chunk_type, - tag_begin, - tag_inside, - tag_end, - tag_single); - size_t i = 0, j = 0; - while (i < output_segments->size() && j < label_segments->size()) { - if (output_segments->at(i) == label_segments->at(j) && - excluded_chunk_types.count(output_segments->at(i).type) != 1) { - ++(*num_correct); - } - if (output_segments->at(i).end < label_segments->at(j).end) { - ++i; - } else if (output_segments->at(i).end > label_segments->at(j).end) { - ++j; - } else { - ++i; - ++j; - } - } - for (auto& segment : (*label_segments)) { - if (excluded_chunk_types.count(segment.type) != 1) { - ++(*num_label_segments); - } - } - for (auto& segment : (*output_segments)) { - if (excluded_chunk_types.count(segment.type) != 1) { - ++(*num_output_segments); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc deleted file mode 100644 index 5d4e3ae331596..0000000000000 --- a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
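Before moving on to the CINN operators, it is worth pinning down the label encoding used by the deleted chunk_eval kernel above: each label id packs a tag type and a chunk type via the modulo/division equations quoted in its DOC. A small hypothetical helper makes the decoding explicit; DecodeChunkLabel is not a Paddle function, just a sketch of the arithmetic.

#include <cstdint>
#include <utility>

// Decode a chunk_eval label id into (tag_type, chunk_type):
// tag_type = label % num_tag_types, chunk_type = label / num_tag_types.
// For the IOB scheme, num_tag_types == 2 (tag 0 = Begin, tag 1 = Inside).
std::pair<int, int> DecodeChunkLabel(int64_t label, int num_tag_types) {
  const int tag_type = static_cast<int>(label % num_tag_types);
  const int chunk_type = static_cast<int>(label / num_tag_types);
  return {tag_type, chunk_type};
}

// Example from the DOC's label map: I-LOC has id 5, so under IOB
// DecodeChunkLabel(5, 2) yields tag_type 1 (Inside) and chunk_type 2 (LOC).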
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/cinn/cinn_instruction_run_op.h" - -#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" - -#include "paddle/fluid/operators/cinn/cinn_launch_context.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle::operators { - -class CinnInstructionRunOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - // Since CINN now supports fill_constant, the CINN graph may have no - // fetch-fed inputs; all of its inputs may be generated by fill_constant. - // OP_INOUT_CHECK(ctx->HasInputs(kX), "Input", kX, "CinnInstructionRun"); - OP_INOUT_CHECK( - ctx->HasOutputs(kOutputs), "Output", kOutputs, "CinnInstructionRun"); - const CinnCompiledObject& compiled_object = - CinnCompiler::GetInstance()->GetCompiledObject( - ctx->Attrs().Get<int64_t>(kCachedIndex)); - - details::CinnLaunchContext* launch_context = - compiled_object.launch_context.get(); - std::vector<std::string> output_args = ctx->Outputs(kOutputs); - std::vector<framework::DDim> output_dims(output_args.size()); - std::transform(output_args.begin(), - output_args.end(), - output_dims.begin(), - [launch_context](const std::string& var_name) { - cinn_buffer_t* buffer = - launch_context->GetCinnBufferOfVar(var_name); - return framework::DDim(buffer->dims, buffer->dimensions); - }); - ctx->SetOutputsDim(kOutputs, output_dims); - } - - protected: - /* [Why use single type kernel]: - * - * Whether the kernel data type is int, float or any other type - * has no effect on its execution logic, so a data type is - * directly specified here. - * - */ - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(framework::proto::VarType::FP32, ctx.GetPlace()); - } -}; - -class CinnInstructionRunOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput(kX, - "(vector<phi::DenseTensor>) " - "which are the input arguments of this cinn instruction") - .AsDuplicable(); - AddOutput(kOutputs, - "(vector<phi::DenseTensor>) " - "which are the output arguments of this cinn instruction") - .AsDuplicable(); - AddAttr<int64_t>( - kCachedIndex, - "(int64_t) " - "the stored index of the cached compilation result in CinnCompiler, " - "which is used to fetch the CinnCompiledObject where this cinn " - "instruction is included"); - AddAttr<int64_t>( - kInstructionIndex, - "(int64_t) " - "the index of this instruction in the cinn runtime program"); - AddComment(R"DOC( -CinnInstructionRun Operator. - -This operator is used to launch the execution of a -CINN(https://github.com/PaddlePaddle/CINN/blob/develop/README.md) instruction. - -Both the input and output of this operator are a set of variables -which are the input and output arguments of the bound cinn instruction respectively. -In addition, an attribute named 'cached_index' must be set to get the -CinnCompiledObject where the instruction is included, -and 'instruction_index' is used to fetch the instruction object from the compiled runtime program.
- -It accomplishes the execution of the instruction according to the following steps: - 0. Set the shapes of the output variables in the InferShape function with the - compilation result. - 1. Fetch the cinn instruction bound to this operator by 'cached_index' - and 'instruction_index' from CinnCompiler. - 2. Prepare the input and output variables of the instruction in Paddle and share - their buffers to CINN by setting 'memory' of the corresponding cinn_buffer_t. - 3. Launch the CINN runtime to execute the instruction. - -)DOC"); - } -}; - -} // namespace paddle::operators - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - cinn_instruction_run, - ops::CinnInstructionRunOp, - ops::CinnInstructionRunOpMaker, - paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>, - paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>); - -PD_REGISTER_STRUCT_KERNEL(cinn_instruction_run, - CPU, - ALL_LAYOUT, - ops::CinnInstructionRunOpKernel, - float) {} diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc deleted file mode 100644 index d5d236dbd6529..0000000000000 --- a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/cinn/cinn_instruction_run_op.h" - -#include "paddle/fluid/framework/op_registry.h" - -namespace ops = paddle::operators; -/* see [Why use single type kernel] */ -PD_REGISTER_STRUCT_KERNEL(cinn_instruction_run, - GPU, - ALL_LAYOUT, - ops::CinnInstructionRunOpKernel, - float) {} diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op.h b/paddle/fluid/operators/cinn/cinn_instruction_run_op.h deleted file mode 100644 index 3a7779ae83338..0000000000000 --- a/paddle/fluid/operators/cinn/cinn_instruction_run_op.h +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
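The header deleted next contains the kernel that consumes the 'cached_index' and 'instruction_index' attributes documented above. The lookup it performs reduces to a bounds-checked fetch into the compiled runtime program's instruction list; a simplified model follows, where Instruction is a stand-in struct rather than the real ::cinn::hlir::framework::Instruction, and the exception replaces the PADDLE_ENFORCE_LT check.

#include <cstdint>
#include <memory>
#include <stdexcept>
#include <vector>

struct Instruction { /* opaque stand-in for the CINN instruction type */ };

// Fetch one instruction of a compiled runtime program by index, mirroring
// the cached_index -> compiled object -> instruction_index chain above.
const Instruction& FetchInstruction(
    const std::vector<std::unique_ptr<Instruction>>& instructions,
    int64_t instruction_index) {
  if (instruction_index < 0 ||
      static_cast<size_t>(instruction_index) >= instructions.size()) {
    throw std::out_of_range("instruction_index out of range");
  }
  return *instructions[instruction_index];
}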
- -#pragma once - -#include -#include -#include -#include - -#include "paddle/cinn/hlir/framework/graph_compiler.h" -#include "paddle/cinn/hlir/framework/instruction.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" -#include "paddle/fluid/framework/paddle2cinn/transform_type.h" -#include "paddle/fluid/operators/cinn/cinn_launch_context.h" -#include "paddle/fluid/operators/cinn/cinn_op_helper.h" - -namespace paddle::operators { - -using CinnInstruction = ::cinn::hlir::framework::Instruction; -using CinnCompiledObject = framework::paddle2cinn::CinnCompiledObject; -using CinnCompiler = framework::paddle2cinn::CinnCompiler; - -template -class CinnInstructionRunOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // step 1: fetch the cinn instruction bound to this operator - auto cached_index = ctx.template Attr(kCachedIndex); - auto ins_index = ctx.template Attr(kInstructionIndex); - const CinnCompiledObject& compiled_object = - CinnCompiler::GetInstance()->GetCompiledObject(cached_index); - const std::vector>& instructions = - compiled_object.runtime_program->GetRunInstructions(); - PADDLE_ENFORCE_LT(ins_index, - instructions.size(), - platform::errors::InvalidArgument( - "Index(%ld) > instructions.size(%ld).", - ins_index, - instructions.size())); - auto&& instruction = instructions.at(ins_index); - - // step 2: prepare the input and output arguments of the instruction - details::CinnLaunchContext* launch_context = - compiled_object.launch_context.get(); - auto share_argument_buffer_fn = [launch_context, - &ctx](const std::string& var_name) { - cinn_buffer_t* buffer = launch_context->GetCinnBufferOfVar(var_name); - std::string revise_var_name = launch_context->RedirectVarName(var_name); - framework::Variable* var = ctx.scope().GetVar(revise_var_name); - auto* tensor = var->template GetMutable(); - buffer->memory = reinterpret_cast(tensor->mutable_data( - ctx.GetPlace(), - framework::paddle2cinn::TransToPaddleDataType(buffer->type))); - }; - std::vector in_args = ctx.InputNames(kX); - std::for_each(in_args.begin(), in_args.end(), share_argument_buffer_fn); - std::vector out_args = ctx.OutputNames(kOutputs); - std::for_each(out_args.begin(), out_args.end(), share_argument_buffer_fn); - - // step 3: launch CINN runtime to execute the instruction - // TODO(CtfGo): simplify format of arguments package as a vector in CINN - // and update this usage call - instruction->Run(&launch_context->FinalizeArguments(), - false, - details::GetStream(ctx)); - } -}; - -} // namespace paddle::operators diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index 734987ce92235..aefc3f8111e54 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -179,7 +179,7 @@ void CinnLaunchContext::BuildVarNameMap( PADDLE_ENFORCE_EQ( res.second, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Cinn variable(%s) maps to more than one paddle variable(%s,%s)", x.second, res.first->second, @@ -198,7 +198,7 @@ void CinnLaunchContext::BuildVarNameMap( PADDLE_ENFORCE_EQ( paddle2cinn_varmap_.size(), cinn2paddle_varmap_.size(), - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Size of variables is not equal, paddle[%ld] vs cinn[%ld]", paddle2cinn_varmap_.size(), 
cinn2paddle_varmap_.size())); @@ -236,7 +236,7 @@ CinnTensor CinnLaunchContext::GetCinnTensorOfVar(const std::string& var_name) { PADDLE_ENFORCE_EQ( IsVariableUsed(var_name), true, - platform::errors::NotFound("Variable(%s) not applied in CINN", var_name)); + phi::errors::NotFound("Variable(%s) not applied in CINN", var_name)); const auto& arg_name = paddle2cinn_varmap_.at(var_name); return cinn_scope_->GetTensor(arg_name); } @@ -276,7 +276,7 @@ void CinnLaunchContext::CheckTensorEquivalent( const std::string& var_name, const phi::DenseTensor& paddle_tensor) { PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Variable(%s) not applied in cinn", var_name)); // check dimension auto cinn_tensor = GetCinnTensorOfVar(var_name); @@ -309,7 +309,7 @@ void CinnLaunchContext::CheckTensorEquivalent( framework::paddle2cinn::TransToPaddleDataType(cinn_tensor->type()); PADDLE_ENFORCE_EQ(paddle_tensor.dtype(), cinn_dtype, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Tensors' dtype in variable(%s) are not equivalent, " "paddle is = [%s], but cinn is = [%s].", var_name, @@ -345,7 +345,7 @@ void CinnLaunchContext::InitializeArguments() { void CinnLaunchContext::AssignExternalVariable(const std::string& var_name) { PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Variable(%s) not applied in cinn", var_name)); auto* cinn_buffer = GetCinnBufferOfVar(var_name); std::string revise_var_name = RedirectVarName(var_name); @@ -372,7 +372,7 @@ void CinnLaunchContext::AssignExternalVariable(const std::string& var_name) { void CinnLaunchContext::AssignInternalVariable(const std::string& var_name) { PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Variable(%s) not applied in cinn", var_name)); auto* cinn_buffer = GetCinnBufferOfVar(var_name); std::string revise_var_name = RedirectVarName(var_name); @@ -458,7 +458,7 @@ std::unique_ptr CinnLaunchContext::BuildCompiledProgram( PADDLE_ENFORCE_NE( res, cinn2paddle_varmap_.end(), - platform::errors::NotFound("Argument(%s) not found", arg)); + phi::errors::NotFound("Argument(%s) not found", arg)); var_names.emplace_back(res->second); } } @@ -592,8 +592,8 @@ cinn_buffer_t* CinnLaunchContext::GetCinnBufferOfVar( PADDLE_ENFORCE_NE( res, paddle2argument_.end(), - platform::errors::NotFound("Variable(%s) not found in compilation result", - var_name)); + phi::errors::NotFound("Variable(%s) not found in compilation result", + var_name)); return static_cast(res->second); } diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cc deleted file mode 100644 index 9edb7348b125c..0000000000000 --- a/paddle/fluid/operators/cinn/cinn_launch_op.cc +++ /dev/null @@ -1,210 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
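The cinn_launch_context.cc hunks above are purely mechanical: only the error-factory namespace changes from platform::errors to phi::errors, while the macro shape and the messages stay the same. As a rough model under that assumption, a PADDLE_ENFORCE_EQ site behaves like the following hypothetical helper; EnforceEq is not the real macro, which additionally captures file and line context.

#include <stdexcept>
#include <string>

// Simplified stand-in for PADDLE_ENFORCE_EQ(a, b,
// phi::errors::InvalidArgument(...)): compare, and throw on mismatch.
template <typename T>
void EnforceEq(const T& a, const T& b, const std::string& message) {
  if (!(a == b)) {
    // analogue of the phi::errors::InvalidArgument payload
    throw std::invalid_argument(message);
  }
}

// Usage, mirroring the varmap size check in cinn_launch_context.cc:
//   EnforceEq(paddle2cinn_varmap.size(), cinn2paddle_varmap.size(),
//             "Size of variables is not equal");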
- -#include "paddle/fluid/operators/cinn/cinn_launch_op.h" - -#include -#include - -#include "paddle/cinn/common/target.h" -#include "paddle/cinn/hlir/framework/graph_compiler.h" -#include "paddle/cinn/runtime/cinn_runtime.h" -#include "paddle/cinn/runtime/flags.h" -#include "paddle/common/flags.h" -#include "paddle/phi/core/generator.h" -#include "paddle/utils/string/string_helper.h" - -#if defined(PADDLE_WITH_CUDA) -COMMON_DECLARE_bool(cudnn_deterministic); -#endif - -namespace paddle { -namespace operators { - -namespace details { - -const ::cinn::common::Target& PlaceToCinnTarget(const platform::Place& place) { - if (platform::is_cpu_place(place)) { - return ::cinn::common::DefaultHostTarget(); - } else if (platform::is_gpu_place(place)) { - return ::cinn::common::DefaultNVGPUTarget(); - } - - PADDLE_THROW(platform::errors::InvalidArgument( - "CINN is not supported on current place:%s", place)); - return ::cinn::common::UnkTarget(); -} - -void DebugCinnCompiledResult(const CinnCompiledObject& result) { - if (!VLOG_IS_ON(4)) { - return; - } - const auto& cinn_runtime_program = result.runtime_program; - const auto& cinn_scope = *(result.scope); - const auto& paddle2cinn_varmap = result.paddle2cinn_varmap; - - VLOG(4) << "Compiled runtime_program instruction size:[" - << cinn_runtime_program->size() << "]"; - - std::vector infos; - auto cinn_var_names = cinn_scope.var_names(); - infos.reserve(cinn_var_names.size()); - std::transform(cinn_var_names.begin(), - cinn_var_names.end(), - std::back_inserter(infos), - [](const auto& name_view) { return name_view.data(); }); - VLOG(4) << "Compiled scope variable names:[" - << string::join_strings(infos, ',') << "]"; - - infos.clear(); - infos.reserve(paddle2cinn_varmap.size()); - std::transform(paddle2cinn_varmap.begin(), - paddle2cinn_varmap.end(), - std::back_inserter(infos), - [](const auto& paddle2cinn) { - return paddle2cinn.first + "->" + paddle2cinn.second; - }); - VLOG(4) << "Compiled paddle2cinn_varmap:[" << string::join_strings(infos, ',') - << "]"; -} - -void LaunchCinnExecution(const CinnCompiledObject& compiled_obj, - const CinnLaunchContext& context, - void* stream) { - compiled_obj.runtime_program->Execute(&context.FinalizeArguments(), stream); -} - -void SetCinnRuntimeFlags() { -#if defined(PADDLE_WITH_CUDA) - VLOG(4) << "Set FLAGS_cinn_cudnn_deterministic to " - << FLAGS_cudnn_deterministic; - ::cinn::runtime::SetCinnCudnnDeterministic(FLAGS_cudnn_deterministic); -#endif -} - -template <> -void SetCinnRandomSeed() { - auto seed = phi::DefaultCPUGenerator()->GetCurrentSeed(); - ::cinn::runtime::RandomSeed::GetOrSet(seed); -} - -void SetCinnTarget(const ::cinn::common::Target& target) { - VLOG(4) << "Set CINN compile target to " << target; - ::cinn::runtime::CurrentTarget::SetCurrentTarget(target); -} - -} // namespace details - -class CinnLaunchOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - // The cinn-graph may hasn't input for CINN now support fill_constant, - // and its all inputs may generated by fill_constant instead of by fetch. 
- // OP_INOUT_CHECK(ctx->HasInputs(kX) || ctx->HasInputs(kNoNeedBufferX), - // "Input", string::format_string("%s|%s", kX, - // kNoNeedBufferX), - // "CinnLaunchOp"); - OP_INOUT_CHECK( - ctx->HasOutputs(kOutputs), "Output", kOutputs, "CinnLaunchOp"); - } - - protected: - /* [Why use single type kernel]: - * - * This op is similar to a control flow op; it does not need - * an op kernel, but in order to make it executable under dynamic - * graph mode, we implement it with an op kernel. - * - * So whether the kernel data type is int, float or any other type - * has no effect on its execution logic, and a data type is - * directly specified here. - * - * Of course, the data type here is also not important. - */ - - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(framework::proto::VarType::FP32, ctx.GetPlace()); - } -}; - -class CinnLaunchOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput(kX, - "(vector<phi::DenseTensor>) " - "which are the inputs of the graph inside the CinnLaunchOp, " - "excluding kNoNeedBufferX.") - .AsDuplicable(); - AddInput(kNoNeedBufferX, - "(vector<phi::DenseTensor>) " - "which are the inputs of the graph inside the CinnLaunchOp, but " - "their buffers are not needed.") - .AsDuplicable() - .AsDispensable(); - AddOutput(kOutputs, - "(vector<phi::DenseTensor>) " - "which are the outputs of the graph inside the CinnLaunchOp.") - .AsDuplicable(); - AddAttr<std::string>( - kCompilationKey, - "(string) " - "a hash key used to get the graph object or its computation result."); - AddComment(R"DOC( -CinnLaunch Operator. - -This operator is used to launch CINN(https://github.com/PaddlePaddle/CINN/blob/develop/README.md) -to compile a graph and execute the compiled object. - -Both the input and output of this operator are a set of variables -which are the input and output of the graph, respectively, that will be -compiled and executed in this operator. -In addition, an attribute named 'compilation_key' must be -set to get the corresponding ir::Graph object of the graph -or its computation result. - -It accomplishes the computation of the graph in the following steps: - 1. Fetch the ir::Graph object from CinnCompiler using kCompilationKey. - 2. Compile the graph to a compiled object, and insert it into the - global cache so that we can directly query it from this cache next time - when the shapes of the input variables have not changed. - 3. Create and instantiate all variables used to execute the compiled runtime program, - if necessary, according to the info (type, shape) included in the returned scope. - 4. Pack each tensor buffer of all the above variables as execution arguments. - 5. Launch execution of the runtime program with the above arguments; - the results are output by writing values to the underlying buffer addresses.
- -)DOC"); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(CinnLaunchOpNoBufVarsInferer, - kNoNeedBufferX); - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - cinn_launch, - ops::CinnLaunchOp, - ops::CinnLaunchOpMaker, - ops::CinnLaunchOpNoBufVarsInferer, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -/* see [Why use single type kernel] */ -PD_REGISTER_STRUCT_KERNEL( - cinn_launch, CPU, ALL_LAYOUT, ops::CinnLaunchOpKernel, float) {} diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc deleted file mode 100644 index a7ff605dca9b9..0000000000000 --- a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/cinn/cinn_launch_op.h" - -#include "paddle/cinn/runtime/flags.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/phi/core/generator.h" - -namespace paddle { -namespace operators { - -namespace details { - -template <> -void SetCinnRandomSeed() { - auto seed = phi::DefaultCUDAGenerator(0)->GetCurrentSeed(); - ::cinn::runtime::RandomSeed::GetOrSet(seed); -} - -} // namespace details -} // namespace operators -} // namespace paddle - -/* see [Why use single type kernel] */ -PD_REGISTER_STRUCT_KERNEL(cinn_launch, - GPU, - ALL_LAYOUT, - paddle::operators::CinnLaunchOpKernel, - float) {} diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h deleted file mode 100644 index 2ce23dc965b31..0000000000000 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ /dev/null @@ -1,185 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include -#include -#include - -#include "paddle/cinn/common/target.h" -#include "paddle/common/flags.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/new_executor/interpretercore.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" -#include "paddle/fluid/operators/cinn/cinn_launch_context.h" -#include "paddle/fluid/operators/cinn/cinn_op_helper.h" -#include "paddle/fluid/platform/profiler.h" -#include "paddle/pir/include/core/program.h" -#include "paddle/pir/include/core/value.h" - -COMMON_DECLARE_bool(enable_pe_launch_cinn); -COMMON_DECLARE_bool(enable_interpretercore_launch_cinn); -namespace paddle { -namespace operators { - -using CinnCompiler = framework::paddle2cinn::CinnCompiler; -using CinnCompiledObject = framework::paddle2cinn::CinnCompiledObject; - -namespace details { - -// Transform Paddle place to CINN target -const ::cinn::common::Target& PlaceToCinnTarget(const platform::Place& place); - -// Print detailed compilation result of graph for debug -void DebugCinnCompiledResult(const CinnCompiledObject& result); - -// Launch cinn to execute compiled executable program and wait done -void LaunchCinnExecution(const CinnCompiledObject& compiled_obj, - const CinnLaunchContext& context, - void* stream); - -// Set cinn FLAGS (such as FLAGS_cinn_cudnn_deterministic) with paddle's FLAGS. -void SetCinnRuntimeFlags(); - -// set CINN global random seed -template -void SetCinnRandomSeed(); - -// set CINN compile target -void SetCinnTarget(const ::cinn::common::Target& target); - -} // namespace details - -template -class CinnLaunchOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto& scope = ctx.scope(); - const auto& place = ctx.GetPlace(); - void* stream = details::GetStream(ctx); - platform::RecordEvent record_event_1( - "Step 1. Find graph object and prepare input"); - // Step 1. 
Find graph object and prepare input - PADDLE_ENFORCE_EQ(ctx.HasAttr(kCompilationKey), - true, - platform::errors::NotFound( - "No Attribute(%s) found for CinnLaunchOp operator.", - kCompilationKey)); - const auto& compilation_key = ctx.template Attr(kCompilationKey); - VLOG(4) << "CinnLaunchOp attribute(" << kCompilationKey << ") " - << "value:\n" - << CinnCompiler::GetInstance()->ReadableKey(compilation_key); - - std::map inputs_name2tensor; - std::vector input_x_variable_names; - std::vector input_no_need_buffer_variable_names; - auto add_name2tensor_fn = - [&inputs_name2tensor]( - const std::vector& variable_names, - const std::vector& tensors) { - std::transform( - variable_names.begin(), - variable_names.end(), - tensors.begin(), - std::inserter(inputs_name2tensor, inputs_name2tensor.end()), - [](const std::string& name, const phi::DenseTensor* tensor) { - return std::make_pair(name, tensor); - }); - }; - - auto input_x_tensors = ctx.MultiInput(kX); - if (!input_x_tensors.empty()) { - input_x_variable_names = std::move(ctx.InputNames(kX)); - add_name2tensor_fn(input_x_variable_names, input_x_tensors); - } - auto input_no_need_buffer_tensors = - ctx.MultiInput(kNoNeedBufferX); - if (!input_no_need_buffer_tensors.empty()) { - input_no_need_buffer_variable_names = - std::move(ctx.InputNames(kNoNeedBufferX)); - add_name2tensor_fn(input_no_need_buffer_variable_names, - input_no_need_buffer_tensors); - } - - platform::RecordEvent record_event_2( - "Step 2. Get compilation result of the graph"); - // Step 2. Get compilation result of the graph - auto target = details::PlaceToCinnTarget(place); - details::SetCinnTarget(target); - using ClockType = std::chrono::steady_clock; - std::chrono::time_point start_t, end_t; - if (VLOG_IS_ON(1)) { - VLOG(1) << "Starts to compile at thread " << std::this_thread::get_id(); - start_t = ClockType::now(); - } - const auto& cinn_compiled_object = CinnCompiler::GetInstance()->Compile( - compilation_key, inputs_name2tensor, target, stream); - if (VLOG_IS_ON(1)) { - end_t = ClockType::now(); - auto time_sec = std::chrono::duration_cast( - end_t - start_t); - VLOG(1) << "Ends to compile at thread " << std::this_thread::get_id() - << " , time cost : " << time_sec.count() << " ms"; - - const auto& visible_names = - cinn_compiled_object.launch_context->GetVisibleVarNames(); - VLOG(1) << "These CINN variable can visible by Paddle: " - << string::join_strings(visible_names, ", "); - } - details::DebugCinnCompiledResult(cinn_compiled_object); - auto* launch_context = cinn_compiled_object.launch_context.get(); - - platform::RecordEvent record_event_3("Step 3. Set CINN runtime FLAGS."); - // Step 3. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic. - details::SetCinnRuntimeFlags(); - - // set CINN global random seed - details::SetCinnRandomSeed(); - - // Step 4. Execute the compiled CINN instructions by a PE or - // by the CINN compiled program in sequential order - if (FLAGS_enable_pe_launch_cinn) { - if (FLAGS_enable_interpretercore_launch_cinn) { - platform::RecordEvent record_event_4( - "Step 4. Execute the runtime program by InterpreterCore."); - VLOG(4) << "Execute the runtime program by InterpreterCore"; - auto* interpreter_core = launch_context->InitializeInterpreterCore( - place, const_cast(&scope)); - interpreter_core->Run({}, false); - } else { - platform::RecordEvent record_event_4( - "Step 4. 
Execute the runtime graph by PE."); - VLOG(4) << "Execute the runtime graph by PE"; - framework::Scope& exec_scope = scope.NewScope(); - auto* pe = launch_context->InitializePE(place, &exec_scope); - pe->RunWithoutFetch(launch_context->GetSkipEagerVars()); - } - } else { - platform::RecordEvent record_event_4( - "Step 4. Execute the compiled executable program."); - VLOG(4) << "Execute the compiled executable program"; - launch_context->UpdateCapturedEnv(scope, place); - LaunchCinnExecution(cinn_compiled_object, *launch_context, stream); - } - VLOG(4) << "CinnLaunchOp launch execution done."; - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index 3895bc09a08a0..4a61792c5b647 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -37,21 +37,21 @@ class ClipByNormOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(X) of ClipByNormOp should not be null. Please " "check if it is created correctly.")); PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(Out) of ClipByNormOp should not be null. " "Please check if it is created correctly.")); auto max_norm = ctx->Attrs().Get("max_norm"); PADDLE_ENFORCE_GT( max_norm, 0, - platform::errors::InvalidArgument("max_norm should be greater than 0. " - "Received max_norm is %f.", - max_norm)); + phi::errors::InvalidArgument("max_norm should be greater than 0. " + "Received max_norm is %f.", + max_norm)); auto x_dims = ctx->GetInputDim("X"); ctx->SetOutputDim("Out", x_dims); ctx->ShareLoD("X", /*->*/ "Out"); diff --git a/paddle/fluid/operators/collective/alltoall_op.cc b/paddle/fluid/operators/collective/alltoall_op.cc index a8e3b2808092c..bd99fdde2f2c2 100644 --- a/paddle/fluid/operators/collective/alltoall_op.cc +++ b/paddle/fluid/operators/collective/alltoall_op.cc @@ -28,7 +28,7 @@ class AllToAllBaseOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE( ring_id, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The ring_id (%d) for alltoall op must be non-negative.", ring_id)); framework::DDim dim = ctx->GetInputDim("X"); if (dim[0] < 0) dim[0] = -1; @@ -79,4 +79,4 @@ PD_REGISTER_STRUCT_KERNEL(alltoall, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/alltoall_op.cu.cc b/paddle/fluid/operators/collective/alltoall_op.cu.cc index 8f942013435eb..93a44776851d4 100644 --- a/paddle/fluid/operators/collective/alltoall_op.cu.cc +++ b/paddle/fluid/operators/collective/alltoall_op.cu.cc @@ -45,7 +45,7 @@ class AllToAllOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_GE( ring_id, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The ring_id (%d) for alltoall op must be non-negative.", ring_id)); auto place = ctx.GetPlace(); @@ -59,7 +59,7 @@ class AllToAllOpCUDAKernel : public framework::OpKernel { if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. 
" @@ -70,7 +70,7 @@ class AllToAllOpCUDAKernel : public framework::OpKernel { comm_context_manager.Get(std::to_string(ring_id))); PADDLE_ENFORCE_NE(comm_ctx, nullptr, - platform::errors::Unavailable( + phi::errors::Unavailable( "NCCLCommContext is nullptr, collective op should " "has ring_id attr.")); stream = comm_ctx->GetStream(); @@ -93,7 +93,7 @@ class AllToAllOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( x_dims[0] % nranks, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dimension size (%d) of the input tensor must be " "divisible by the number of ranks (%d).", x_dims[0], @@ -126,12 +126,11 @@ class AllToAllOpCUDAKernel : public framework::OpKernel { VLOG(3) << "old NCCLCommContext has rid " << ring_id; } #else - PADDLE_THROW( - platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); + PADDLE_THROW(phi::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); #endif #else PADDLE_THROW( - platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); + phi::errors::Unavailable("PaddlePaddle should compile with GPU.")); #endif } }; @@ -153,5 +152,5 @@ PD_REGISTER_STRUCT_KERNEL(alltoall, #endif int, int64_t, - plat::float16) { + phi::dtype::float16) { } diff --git a/paddle/fluid/operators/collective/alltoall_op.h b/paddle/fluid/operators/collective/alltoall_op.h index 61456c268d5d5..187d4965cdcc8 100644 --- a/paddle/fluid/operators/collective/alltoall_op.h +++ b/paddle/fluid/operators/collective/alltoall_op.h @@ -33,7 +33,7 @@ template class AllToAllOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx UNUSED) const override { - PADDLE_THROW(platform::errors::Unavailable( + PADDLE_THROW(phi::errors::Unavailable( "Do not support alltoall for cpu kernel now.")); } }; diff --git a/paddle/fluid/operators/collective/barrier_op.cu.cc b/paddle/fluid/operators/collective/barrier_op.cu.cc index 04d409e82b4d5..dc6b701afee00 100644 --- a/paddle/fluid/operators/collective/barrier_op.cu.cc +++ b/paddle/fluid/operators/collective/barrier_op.cu.cc @@ -47,7 +47,7 @@ class BarrierOpCUDAKernel : public framework::OpKernel { if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. 
" @@ -58,7 +58,7 @@ class BarrierOpCUDAKernel : public framework::OpKernel { comm_context_manager.Get(std::to_string(rid))); PADDLE_ENFORCE_NE(comm_ctx, nullptr, - platform::errors::Unavailable( + phi::errors::Unavailable( "NCCLCommContext is nullptr, collective op should " "has ring_id attr.")); auto stream = comm_ctx->GetStream(); @@ -82,8 +82,8 @@ class BarrierOpCUDAKernel : public framework::OpKernel { VLOG(3) << "old NCCLCommContext has rid " << rid; } #else - PADDLE_THROW(platform::errors::Unavailable( - "PaddlePaddle should compile with NCCL.")); + PADDLE_THROW( + phi::errors::Unavailable("PaddlePaddle should compile with NCCL.")); #endif } }; diff --git a/paddle/fluid/operators/collective/barrier_op.h b/paddle/fluid/operators/collective/barrier_op.h index b05f2de53a073..6bbd5c38a2f76 100644 --- a/paddle/fluid/operators/collective/barrier_op.h +++ b/paddle/fluid/operators/collective/barrier_op.h @@ -41,12 +41,12 @@ class BarrierOpCPUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( gloo->IsInitialized(), true, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "You must initialize the gloo environment first to use it.")); gloo::BarrierOptions opts(gloo->GetContext()); gloo::barrier(opts); #else - PADDLE_THROW(platform::errors::Unavailable( + PADDLE_THROW(phi::errors::Unavailable( "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON")); #endif } diff --git a/paddle/fluid/operators/collective/c_allgather_op.cc b/paddle/fluid/operators/collective/c_allgather_op.cc index 2a0087cd8aa72..e67a2cccc16e9 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cc @@ -26,10 +26,10 @@ class CAllGatherOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "AllGather"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Input", "Out", "AllGather"); int nranks = ctx->Attrs().Get("nranks"); - PADDLE_ENFORCE_GE(nranks, - 2, - platform::errors::InvalidArgument( - "The value of nranks should be >=2.")); + PADDLE_ENFORCE_GE( + nranks, + 2, + phi::errors::InvalidArgument("The value of nranks should be >=2.")); framework::DDim dim = ctx->GetInputDim("X"); // 0D use stack/unstack while others use concat/split if (dim.size() == 0) { @@ -85,4 +85,4 @@ PD_REGISTER_STRUCT_KERNEL(c_allgather, int64_t, uint8_t, bool, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/c_allgather_op.cu.cc b/paddle/fluid/operators/collective/c_allgather_op.cu.cc index dcd88f4a311ee..7b57e7af25f9b 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cu.cc @@ -67,7 +67,7 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel { if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. 
" @@ -78,7 +78,7 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel { comm_context_manager.Get(std::to_string(rid))); PADDLE_ENFORCE_NE(comm_ctx, nullptr, - platform::errors::Unavailable( + phi::errors::Unavailable( "NCCLCommContext is nullptr, collective op should " "has ring_id attr.")); stream = comm_ctx->GetStream(); @@ -88,7 +88,7 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( nranks, comm->nranks(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "nranks: %s should equal to %s", nranks, comm->nranks())); stream = comm->stream(); VLOG(3) << "old NCCLCommContext has rid " << rid; @@ -112,7 +112,7 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel { } #else - PADDLE_THROW(platform::errors::PreconditionNotMet( + PADDLE_THROW(phi::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU.")); #endif } @@ -138,5 +138,5 @@ PD_REGISTER_STRUCT_KERNEL(c_allgather, int8_t, int64_t, bool, - plat::float16) { + phi::dtype::float16) { } diff --git a/paddle/fluid/operators/collective/c_allgather_op.h b/paddle/fluid/operators/collective/c_allgather_op.h index b4aff2c2363ec..c5e2088da9889 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.h +++ b/paddle/fluid/operators/collective/c_allgather_op.h @@ -49,14 +49,14 @@ class CAllGatherOpCPUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( gloo->IsInitialized(), true, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "You must initialize the gloo environment first to use it.")); gloo::AllgatherOptions opts(gloo->GetContext()); opts.setInput(const_cast(send_buff), send_numel); opts.setOutput(recv_buff, send_numel * nranks); gloo::allgather(opts); #else - PADDLE_THROW(platform::errors::Unavailable( + PADDLE_THROW(phi::errors::Unavailable( "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON")); #endif } diff --git a/paddle/fluid/operators/collective/c_allgather_op_xpu.cc b/paddle/fluid/operators/collective/c_allgather_op_xpu.cc index d31c120cf9ede..48e965894a294 100644 --- a/paddle/fluid/operators/collective/c_allgather_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op_xpu.cc @@ -65,7 +65,7 @@ class CAllGatherOpXPUKernel : public framework::OpKernel { if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. 
" @@ -76,7 +76,7 @@ class CAllGatherOpXPUKernel : public framework::OpKernel { comm_context_manager.Get(std::to_string(rid))); PADDLE_ENFORCE_NE(comm_ctx, nullptr, - platform::errors::Unavailable( + phi::errors::Unavailable( "BKCLCommContext is nullptr, collective op should " "has ring_id attr.")); stream = comm_ctx->GetStream(); @@ -86,7 +86,7 @@ class CAllGatherOpXPUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( nranks, comm->nranks(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "nranks: %s should equal to %s", nranks, comm->nranks())); stream = comm->stream(); VLOG(3) << "old BKCLCommContext has rid " << rid; @@ -106,7 +106,7 @@ class CAllGatherOpXPUKernel : public framework::OpKernel { comm->comm(), sendbuff, numel, recvbuff, dtype, stream)); } #else - PADDLE_THROW(platform::errors::PreconditionNotMet( + PADDLE_THROW(phi::errors::PreconditionNotMet( "PaddlePaddle should be compiled with XPU and bkcl.")); #endif } @@ -124,7 +124,7 @@ PD_REGISTER_STRUCT_KERNEL(c_allgather, ops::CAllGatherOpXPUKernel, float, double, - plat::float16, + phi::dtype::float16, int, int64_t, uint8_t, diff --git a/paddle/fluid/operators/collective/c_allreduce_avg_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_avg_op.cu.cc index d3f0b45f64432..e859145df8b73 100644 --- a/paddle/fluid/operators/collective/c_allreduce_avg_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_avg_op.cu.cc @@ -31,5 +31,5 @@ PD_REGISTER_STRUCT_KERNEL(c_allreduce_avg, double, int, int64_t, - plat::float16, + phi::dtype::float16, plat::bfloat16) {} diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cc index c47bf7025e1fd..d659be0f3d141 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cc @@ -55,4 +55,4 @@ PD_REGISTER_STRUCT_KERNEL(c_allreduce_max, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc index 277988b56916f..012b280a9ab15 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc @@ -34,5 +34,5 @@ PD_REGISTER_STRUCT_KERNEL(c_allreduce_max, double, int, int64_t, - plat::float16) { + phi::dtype::float16) { } diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc index 8c648b4ae4a37..943df02ad93e2 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc @@ -28,4 +28,4 @@ PD_REGISTER_STRUCT_KERNEL(c_allreduce_max, ops::CAllReduceMaxXPUKernel, float, int, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op.cc b/paddle/fluid/operators/collective/c_allreduce_min_op.cc index c21337a27202e..2a9dd023cf162 100644 --- a/paddle/fluid/operators/collective/c_allreduce_min_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cc @@ -56,4 +56,4 @@ PD_REGISTER_STRUCT_KERNEL(c_allreduce_min, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc index 4475abdef281b..a3eec10051c52 100644 --- a/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc 
+++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc @@ -31,4 +31,4 @@ PD_REGISTER_STRUCT_KERNEL(c_allreduce_min, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc index f9be16781af70..fb19a2924d1eb 100644 --- a/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc @@ -28,4 +28,4 @@ PD_REGISTER_STRUCT_KERNEL(c_allreduce_min, ops::CAllReduceMinXPUKernel, float, int, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 55ca03c0bc626..db9d6d5361462 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -96,7 +96,7 @@ class CAllReduceOpCPUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( gloo->IsInitialized(), true, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "You must initialize the gloo environment first to use it.")); gloo::AllreduceOptions opts(gloo->GetContext()); opts.setInput(const_cast(send_buff), send_numel); @@ -123,14 +123,14 @@ class CAllReduceOpCPUKernel : public framework::OpKernel { &gloo::product)); break; default: - PADDLE_ENFORCE_EQ(true, - false, - platform::errors::InvalidArgument( - "Invalid reduce type: %d.", red_type)); + PADDLE_ENFORCE_EQ( + true, + false, + phi::errors::InvalidArgument("Invalid reduce type: %d.", red_type)); } gloo::allreduce(opts); #else - PADDLE_THROW(platform::errors::Unavailable( + PADDLE_THROW(phi::errors::Unavailable( "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON")); #endif } @@ -150,11 +150,11 @@ class CAllReduceOpXPUKernel : public framework::OpKernel { auto place = cond->place(); PADDLE_ENFORCE_EQ(platform::is_cpu_place(place), true, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The input `cond` tensor should be on cpu place")); PADDLE_ENFORCE_EQ(cond->numel(), 1, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The input `cond` should be shape [1]")); if (!cond->data()[0]) { VLOG(4) << "Skip all reduce Op since cond is 0"; @@ -197,8 +197,8 @@ class CAllReduceOpXPUKernel : public framework::OpKernel { break; default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid reduce type: %d", red_type)); + PADDLE_THROW(phi::errors::InvalidArgument("Invalid reduce type: %d", + red_type)); } auto task = pg->AllReduce(out, *in, opts, false, true); @@ -215,7 +215,7 @@ class CAllReduceOpXPUKernel : public framework::OpKernel { if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. 
" @@ -226,7 +226,7 @@ class CAllReduceOpXPUKernel : public framework::OpKernel { comm_context_manager.Get(std::to_string(rid))); PADDLE_ENFORCE_NE(comm_ctx, nullptr, - platform::errors::Unavailable( + phi::errors::Unavailable( "BKCLCommContext is nullptr, collective op should " "has ring_id attr.")); stream = comm_ctx->GetStream(); @@ -262,8 +262,8 @@ class CAllReduceOpXPUKernel : public framework::OpKernel { break; default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid reduce type: %d", red_type)); + PADDLE_THROW( + phi::errors::InvalidArgument("Invalid reduce type: %d", red_type)); } if (comm_ctx) { @@ -278,7 +278,7 @@ class CAllReduceOpXPUKernel : public framework::OpKernel { stream)); } #else - PADDLE_THROW(platform::errors::PreconditionNotMet( + PADDLE_THROW(phi::errors::PreconditionNotMet( "PaddlePaddle should be compiled with XPU.")); #endif } @@ -297,11 +297,11 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { auto place = cond->place(); PADDLE_ENFORCE_EQ(platform::is_cpu_place(place), true, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The input `cond` tensor should be on cpu place")); PADDLE_ENFORCE_EQ(cond->numel(), 1, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The input `cond` should be shape [1]")); if (!cond->data()[0]) { VLOG(4) << "Skip all reduce Op since cond is 0"; @@ -345,8 +345,8 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { break; default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid reduce type: %d", red_type)); + PADDLE_THROW(phi::errors::InvalidArgument("Invalid reduce type: %d", + red_type)); } auto task = pg->AllReduce(out, *in, opts, false, true); @@ -363,7 +363,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. 
" @@ -374,7 +374,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { comm_context_manager.Get(std::to_string(rid))); PADDLE_ENFORCE_NE(comm_ctx, nullptr, - platform::errors::Unavailable( + phi::errors::Unavailable( "NCCLCommContext is nullptr, collective op should " "has ring_id attr.")); stream = comm_ctx->GetStream(); @@ -420,8 +420,8 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { #endif default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid reduce type: %d", red_type)); + PADDLE_THROW( + phi::errors::InvalidArgument("Invalid reduce type: %d", red_type)); } if (comm_ctx) { @@ -436,7 +436,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { stream)); } #else - PADDLE_THROW(platform::errors::PreconditionNotMet( + PADDLE_THROW(phi::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU.")); #endif } diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc index ee40f29d789e1..181b78b545e7c 100644 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc @@ -56,4 +56,4 @@ PD_REGISTER_STRUCT_KERNEL(c_allreduce_prod, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc index c63a1d2182678..e2c0a71a9ced4 100644 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc @@ -31,4 +31,4 @@ PD_REGISTER_STRUCT_KERNEL(c_allreduce_prod, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc index 5558b1722093a..d3696c2c5dfc1 100644 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc @@ -28,4 +28,4 @@ PD_REGISTER_STRUCT_KERNEL(c_allreduce_prod, ops::CAllReduceProdXPUKernel, float, int, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc index 79e70757fbcfd..80b97b2bc70cb 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc @@ -77,4 +77,4 @@ PD_REGISTER_STRUCT_KERNEL(c_allreduce_sum, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc index 76d809cd234f0..909bd23db2413 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc @@ -34,5 +34,5 @@ PD_REGISTER_STRUCT_KERNEL(c_allreduce_sum, double, int, int64_t, - plat::float16) { + phi::dtype::float16) { } diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc index 1d4c5f63b5850..21bedcff8774b 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc @@ -28,4 +28,4 @@ PD_REGISTER_STRUCT_KERNEL(c_allreduce_sum, ops::CAllReduceSumXPUKernel, float, int, - plat::float16) {} + phi::dtype::float16) {} diff --git 
a/paddle/fluid/operators/collective/c_broadcast_op.cc b/paddle/fluid/operators/collective/c_broadcast_op.cc index 670b69c05701c..27f3a1bcdc29f 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cc @@ -73,4 +73,4 @@ PD_REGISTER_STRUCT_KERNEL(c_broadcast, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc index 4d49bc4990c6e..98f9102f2d8f0 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc @@ -80,7 +80,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { out->set_lod(x->lod()); #else - PADDLE_THROW(platform::errors::PreconditionNotMet( + PADDLE_THROW(phi::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU.")); #endif } @@ -103,5 +103,5 @@ PD_REGISTER_STRUCT_KERNEL(c_broadcast, #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, #endif - plat::float16) { + phi::dtype::float16) { } diff --git a/paddle/fluid/operators/collective/c_broadcast_op.h b/paddle/fluid/operators/collective/c_broadcast_op.h index e0d6158f19db7..c02b8f8a9a4fe 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.h +++ b/paddle/fluid/operators/collective/c_broadcast_op.h @@ -59,7 +59,7 @@ class CBroadcastOpCPUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( gloo->IsInitialized(), true, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "You must initialize the gloo environment first to use it.")); gloo::BroadcastOptions opts(gloo->GetContext()); opts.setOutput(recv_buff, send_numel); @@ -67,7 +67,7 @@ class CBroadcastOpCPUKernel : public framework::OpKernel { gloo::broadcast(opts); } #else - PADDLE_THROW(platform::errors::Unavailable( + PADDLE_THROW(phi::errors::Unavailable( "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON")); #endif } diff --git a/paddle/fluid/operators/collective/c_broadcast_op_xpu.cc b/paddle/fluid/operators/collective/c_broadcast_op_xpu.cc index 6bf9d956a342e..ac7d9623e3241 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_xpu.cc @@ -50,7 +50,7 @@ class CBroadcastOpXPUKernel : public framework::OpKernel { if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. 
" @@ -61,7 +61,7 @@ class CBroadcastOpXPUKernel : public framework::OpKernel { comm_context_manager.Get(std::to_string(ring_id))); PADDLE_ENFORCE_NE(comm_ctx, nullptr, - platform::errors::Unavailable( + phi::errors::Unavailable( "BKCLCommContext is nullptr, collective op should " "has ring_id attr.")); stream = comm_ctx->GetStream(); @@ -119,7 +119,7 @@ class CBroadcastOpXPUKernel : public framework::OpKernel { out->Resize(x->dims()); out->set_lod(x->lod()); #else - PADDLE_THROW(platform::errors::PreconditionNotMet( + PADDLE_THROW(phi::errors::PreconditionNotMet( "PaddlePaddle should be compiled with XPU and BKCL.")); #endif } @@ -137,6 +137,6 @@ PD_REGISTER_STRUCT_KERNEL(c_broadcast, ops::CBroadcastOpXPUKernel, float, double, - plat::float16, + phi::dtype::float16, int, int64_t) {} diff --git a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc index 14059c3d91027..ca0a45c8ae79c 100644 --- a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc @@ -54,7 +54,7 @@ class CCommInitMultiTrainerOp : public framework::OperatorBase { const platform::Place& place) const override { auto var = scope.FindVar(Input("X")); PADDLE_ENFORCE_NOT_NULL( - var, platform::errors::InvalidArgument("Input X must be provided.")); + var, phi::errors::InvalidArgument("Input X must be provided.")); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ncclUniqueId* nccl_id = var->GetMutable(); @@ -70,8 +70,8 @@ class CCommInitMultiTrainerOp : public framework::OperatorBase { platform::NCCLCommContext::Instance().CreateNCCLCommMultiTrainer( devices, nccl_id, ntrainers, train_id, rid); #else - PADDLE_THROW(platform::errors::Unimplemented( - "PaddlePaddle should compile with GPU.")); + PADDLE_THROW( + phi::errors::Unimplemented("PaddlePaddle should compile with GPU.")); #endif } }; diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc index 172e330675033..5c6613a0e9ca3 100644 --- a/paddle/fluid/operators/collective/c_comm_init_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_op.cc @@ -65,7 +65,7 @@ class CCommInitOp : public framework::OperatorBase { #if defined(PADDLE_WITH_CUSTOM_DEVICE) auto var = scope.FindVar(Input("X")); PADDLE_ENFORCE_NOT_NULL( - var, platform::errors::InvalidArgument("Input con not be empty.")); + var, phi::errors::InvalidArgument("Input con not be empty.")); int nranks = Attr("nranks"); int rid = Attr("ring_id"); @@ -87,7 +87,7 @@ class CCommInitOp : public framework::OperatorBase { "c_comm_init_op"); } #else - PADDLE_THROW(platform::errors::PreconditionNotMet( + PADDLE_THROW(phi::errors::PreconditionNotMet( "PaddlePaddle should compile with custom device.")); #endif } else { @@ -99,21 +99,21 @@ class CCommInitOp : public framework::OperatorBase { using UniqueId = BKCLUniqueId; using CommContext = platform::BKCLCommContext; #else - PADDLE_THROW(platform::errors::PreconditionNotMet( + PADDLE_THROW(phi::errors::PreconditionNotMet( "PaddlePaddle should be compiled with GPU or XPU.")); #endif PADDLE_ENFORCE_EQ( platform::is_gpu_place(place) || platform::is_xpu_place(place), true, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "CCommInitOp can run on gpu or xpu place only.")); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) auto var = scope.FindVar(Input("X")); PADDLE_ENFORCE_NOT_NULL( - 
var, platform::errors::InvalidArgument("Input con not be empty.")); + var, phi::errors::InvalidArgument("Input con not be empty.")); int nranks = Attr("nranks"); int rid = Attr("ring_id"); diff --git a/paddle/fluid/operators/collective/c_concat_op.cc b/paddle/fluid/operators/collective/c_concat_op.cc index 27c1141f8b67f..75db7e9fad427 100644 --- a/paddle/fluid/operators/collective/c_concat_op.cc +++ b/paddle/fluid/operators/collective/c_concat_op.cc @@ -27,29 +27,29 @@ class CConcatOp : public framework::OperatorWithKernel { int nranks = ctx->Attrs().Get("nranks"); int rank = ctx->Attrs().Get("rank"); int ring_id = ctx->Attrs().Get("ring_id"); - PADDLE_ENFORCE_GE(nranks, - 2, - platform::errors::InvalidArgument( - "The number of ranks (%d) for c_concat " - "must be greater than 1.", - nranks)); + PADDLE_ENFORCE_GE( + nranks, + 2, + phi::errors::InvalidArgument("The number of ranks (%d) for c_concat " + "must be greater than 1.", + nranks)); PADDLE_ENFORCE_GE( ring_id, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The ring_id (%d) for c_concat must be non-negative.", ring_id)); PADDLE_ENFORCE_GE( rank, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rank (%d) for c_concat must be non-negative.", rank)); - PADDLE_ENFORCE_LT(rank, - nranks, - platform::errors::InvalidArgument( - "The value of rank (%d) for c_concat must " - "be less than that of nranks.", - rank, - nranks)); + PADDLE_ENFORCE_LT( + rank, + nranks, + phi::errors::InvalidArgument("The value of rank (%d) for c_concat must " + "be less than that of nranks.", + rank, + nranks)); framework::DDim dim = ctx->GetInputDim("X"); dim[dim.size() - 1] = dim[dim.size() - 1] * nranks; @@ -121,4 +121,4 @@ PD_REGISTER_STRUCT_KERNEL(c_concat, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/c_concat_op.cu.cc b/paddle/fluid/operators/collective/c_concat_op.cu.cc index b75b2d4b0f687..9ed68c7c6809b 100644 --- a/paddle/fluid/operators/collective/c_concat_op.cu.cc +++ b/paddle/fluid/operators/collective/c_concat_op.cu.cc @@ -47,19 +47,19 @@ class CConcatOpCUDAKernel : public framework::OpKernel { auto place = ctx.GetPlace(); PADDLE_ENFORCE_GE(rank, 0, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The value of rank (%d) for c_concat must be " "greater than or equal to 0.", rank)); PADDLE_ENFORCE_GE(nranks, 2, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The value of nranks (%d) for c_concat must be " "greater than or equal to 2.", nranks)); PADDLE_ENFORCE_LT(rank, nranks, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The value of rank (%d) for c_concat must be " "less than that of nranks (%d).", rank, @@ -95,7 +95,7 @@ class CConcatOpCUDAKernel : public framework::OpKernel { if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm " @@ -107,7 +107,7 @@ class CConcatOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_NE( comm_ctx, nullptr, - platform::errors::Unavailable( + phi::errors::Unavailable( "NCCLCommContext is nullptr, collective op should " "has ring_id attr.")); stream = comm_ctx->GetStream(); @@ -117,7 +117,7 @@ class CConcatOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( 
nranks, comm->nranks(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "nranks: %s should equal to %s", nranks, comm->nranks())); stream = comm->stream(); VLOG(3) << "old NCCLCommContext has rid " << rid; @@ -156,7 +156,7 @@ class CConcatOpCUDAKernel : public framework::OpKernel { auto& dev_ctx2 = ctx.template device_context(); functor(dev_ctx2, inputs, axis, out); #else - PADDLE_THROW(platform::errors::PreconditionNotMet( + PADDLE_THROW(phi::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU.")); #endif } @@ -178,5 +178,5 @@ PD_REGISTER_STRUCT_KERNEL(c_concat, #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, #endif - plat::float16) { + phi::dtype::float16) { } diff --git a/paddle/fluid/operators/collective/c_concat_op.h b/paddle/fluid/operators/collective/c_concat_op.h index 39bdc4c2740de..84edccffc6fa3 100644 --- a/paddle/fluid/operators/collective/c_concat_op.h +++ b/paddle/fluid/operators/collective/c_concat_op.h @@ -29,7 +29,7 @@ template class CConcatOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx UNUSED) const override { - PADDLE_THROW(platform::errors::Unavailable( + PADDLE_THROW(phi::errors::Unavailable( "Do not support c_concat for cpu kernel now.")); } }; diff --git a/paddle/fluid/operators/collective/c_concat_op_xpu.cc b/paddle/fluid/operators/collective/c_concat_op_xpu.cc index 10a2624ae83a4..fcd3c8b33f8b9 100644 --- a/paddle/fluid/operators/collective/c_concat_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_concat_op_xpu.cc @@ -46,19 +46,19 @@ class CConcatOpXPUKernel : public framework::OpKernel { int rid = ctx.Attr("ring_id"); PADDLE_ENFORCE_GE(rank, 0, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The value of rank (%d) for c_concat must be " "greater than or equal to 0.", rank)); PADDLE_ENFORCE_GE(nranks, 2, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The value of nranks (%d) for c_concat must be " "greater than or equal to 2.", nranks)); PADDLE_ENFORCE_LT(rank, nranks, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The value of rank (%d) for c_concat must be " "less than that of nranks (%d).", rank, @@ -95,7 +95,7 @@ class CConcatOpXPUKernel : public framework::OpKernel { if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm " @@ -107,7 +107,7 @@ class CConcatOpXPUKernel : public framework::OpKernel { PADDLE_ENFORCE_NE( comm_ctx, nullptr, - platform::errors::Unavailable( + phi::errors::Unavailable( "BKCLCommContext is nullptr, collective op should " "has ring_id attr.")); stream = comm_ctx->GetStream(); @@ -118,7 +118,7 @@ class CConcatOpXPUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( nranks, comm->nranks(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "nranks: %s should equal to %s", nranks, comm->nranks())); stream = comm->stream(); VLOG(3) << "old BKCLCommContext has rid " << rid; @@ -151,7 +151,7 @@ class CConcatOpXPUKernel : public framework::OpKernel { dev_ctx.template Alloc(out, x->dtype()); functor(dev_ctx, inputs, axis, out); #else - PADDLE_THROW(platform::errors::PreconditionNotMet( + PADDLE_THROW(phi::errors::PreconditionNotMet( "PaddlePaddle should compile with XPU.")); #endif } 
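The hunks on both sides of this point repeat one mechanical pattern: at each PADDLE_ENFORCE_* and PADDLE_THROW call site, the error-factory namespace moves from platform::errors to phi::errors while the enforced condition and the message text stay the same (they are only re-wrapped by clang-format). A minimal self-contained sketch of why that substitution is safe, using toy names (TOY_ENFORCE_GE and a single-argument stand-in for phi::errors::InvalidArgument, not Paddle's real headers): the macro treats the error object as an opaque argument that is only constructed when the check fails, so swapping the factory namespace cannot change control flow.

// toy_enforce.cc -- illustrative only; mirrors the PADDLE_ENFORCE pattern,
// not PaddlePaddle's actual implementation.
#include <cstdio>
#include <stdexcept>

namespace phi {
namespace errors {
// Stand-in factory: Paddle's version is variadic and returns a rich error
// object; this toy formats one value and returns std::runtime_error.
inline std::runtime_error InvalidArgument(const char* fmt, long v) {
  char buf[256];
  std::snprintf(buf, sizeof(buf), fmt, v);
  return std::runtime_error(buf);
}
}  // namespace errors
}  // namespace phi

// Toy macro: evaluates the condition; the error expression is constructed
// only on failure, exactly like the third argument of PADDLE_ENFORCE_GE.
#define TOY_ENFORCE_GE(a, b, err)   \
  do {                              \
    if (!((a) >= (b))) throw (err); \
  } while (0)

int main() {
  int ring_id = -1;
  try {
    TOY_ENFORCE_GE(ring_id, 0,
                   phi::errors::InvalidArgument(
                       "The ring_id (%ld) must be non-negative.",
                       (long)ring_id));
  } catch (const std::exception& e) {
    std::printf("enforce failed: %s\n", e.what());
  }
  return 0;
}

Because the condition and the error expression are independent macro arguments, the whole migration can be done as a call-site substitution across hundreds of operators without touching any kernel logic, which is exactly what these hunks show.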
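The other recurring hunk in these collective kernels is the communicator selection: when FLAGS_dynamic_static_unified_comm is set, the kernel requires the ring id to be registered with the new comm-context manager and takes the stream from the resulting NCCLCommContext/BKCLCommContext; otherwise it falls back to the legacy per-ring platform::NCCLCommContext singleton. A hedged, self-contained sketch of that branch follows; Stream, NewCommContext, LegacyComm, and PickStream are toy names invented for illustration, while the real classes live in phi::distributed and paddle::platform.

// comm_select.cc -- simplified control-flow sketch of the
// FLAGS_dynamic_static_unified_comm branch seen in these kernels.
#include <cstdio>
#include <map>
#include <string>

struct Stream { const char* tag; };

struct NewCommContext {  // stand-in for phi::distributed::NCCLCommContext
  Stream s{"new-comm-stream"};
  Stream* GetStream() { return &s; }
};

struct LegacyComm {  // stand-in for a platform::NCCLCommContext entry
  Stream s{"legacy-stream"};
  Stream* stream() { return &s; }
};

bool FLAGS_dynamic_static_unified_comm = true;  // env-driven in Paddle
std::map<std::string, NewCommContext> comm_context_manager;  // keyed by ring id
std::map<int, LegacyComm> legacy_comms;

Stream* PickStream(int ring_id) {
  if (FLAGS_dynamic_static_unified_comm) {
    // New path: the ring id must already be registered; the real kernels
    // enforce this with PADDLE_ENFORCE_EQ(comm_context_manager.Has(...),
    // true, ...) and PADDLE_ENFORCE_NE(comm_ctx, nullptr, ...).
    auto it = comm_context_manager.find(std::to_string(ring_id));
    if (it == comm_context_manager.end()) return nullptr;
    return it->second.GetStream();
  }
  // Legacy path: per-ring singleton lookup, then comm->stream().
  return legacy_comms[ring_id].stream();
}

int main() {
  comm_context_manager[std::to_string(0)] = NewCommContext{};
  Stream* s = PickStream(0);
  std::printf("selected stream: %s\n", s ? s->tag : "none");
  return 0;
}

In the real kernels the failure paths raise phi::errors::InvalidArgument or phi::errors::Unavailable through the enforce macros rather than returning nullptr; this PR changes only which namespace those factories come from, not the selection logic itself.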
@@ -169,4 +169,5 @@ PD_REGISTER_STRUCT_KERNEL(c_concat, float, int, int64_t, - plat::float16) {} + plat::float16, + plat::bfloat16) {} diff --git a/paddle/fluid/operators/collective/c_embedding_op.cc b/paddle/fluid/operators/collective/c_embedding_op.cc index 86e882b1c6cc8..0bbd64abb10d5 100644 --- a/paddle/fluid/operators/collective/c_embedding_op.cc +++ b/paddle/fluid/operators/collective/c_embedding_op.cc @@ -33,7 +33,7 @@ class CEmbeddingOp : public framework::OperatorWithKernel { VLOG(5) << "ids rank is " << ids_rank << std::endl; PADDLE_ENFORCE_EQ(table_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimensions of the 'c_embedding' must be 2. " "But received c_embedding's dimensions = %d, " "c_embedding's shape = [%s].", @@ -57,7 +57,7 @@ class CEmbeddingOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( (height > 0 && width > 0 && start_idx >= 0), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "height:%ld width:%ld start_idx:%ld must not have negative values", height, width, @@ -133,10 +133,10 @@ class CEmbeddingOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(framework::GradVarName("W"), table_dims); // check valid - PADDLE_ENFORCE_EQ(table_dims.size(), - 2, - platform::errors::InvalidArgument( - "Only accept the dims of table_t == 2")); + PADDLE_ENFORCE_EQ( + table_dims.size(), + 2, + phi::errors::InvalidArgument("Only accept the dims of table_t == 2")); const int64_t start_idx = ctx->Attrs().Get("start_index"); const int64_t height = table_dims[0]; @@ -145,7 +145,7 @@ class CEmbeddingOpGrad : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( (height > 0 && width > 0 && start_idx >= 0), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "height:%ld width:%ld start_idx:%ld must not have negative values", height, width, diff --git a/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc b/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc index a7a234f5792ef..3d469b81609f8 100644 --- a/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc @@ -31,10 +31,10 @@ namespace operators { static void GenBKCLID(std::vector* bkcl_ids) { for (size_t i = 0; i < bkcl_ids->size(); ++i) { BKCLResult_t ret = bkcl_get_unique_id(&(*bkcl_ids)[i]); - PADDLE_ENFORCE_EQ(BKCL_SUCCESS, - ret, - platform::errors::PreconditionNotMet( - "bkcl get unique id failed [%d]", ret)); + PADDLE_ENFORCE_EQ( + BKCL_SUCCESS, + ret, + phi::errors::PreconditionNotMet("bkcl get unique id failed [%d]", ret)); } } @@ -46,8 +46,8 @@ static void CopyBKCLIDToVar(const std::vector& bkcl_ids, auto var = scope.FindVar(var_name); PADDLE_ENFORCE_NOT_NULL( var, - platform::errors::NotFound("Variable with name %s is not found", - var_name.c_str())); + phi::errors::NotFound("Variable with name %s is not found", + var_name.c_str())); auto bkcl_id = var->GetMutable(); memcpy(bkcl_id, &bkcl_ids[i], sizeof(BKCLUniqueId)); } diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc index c66aedd3b3923..f7f92a0a574df 100644 --- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc @@ -42,8 +42,8 @@ static void CopyNCCLIDToVar(const std::vector& nccl_ids, auto var = scope.FindVar(var_name); PADDLE_ENFORCE_NOT_NULL( var, - platform::errors::NotFound("Variable with name %s is not found", - var_name.c_str())); + 
phi::errors::NotFound("Variable with name %s is not found", + var_name.c_str())); auto nccl_id = var->GetMutable(); memcpy(nccl_id, &nccl_ids[i], sizeof(ncclUniqueId)); } diff --git a/paddle/fluid/operators/collective/c_gen_xccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_xccl_id_op.cc index e404a1357ee75..c24fb4964b336 100644 --- a/paddle/fluid/operators/collective/c_gen_xccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_xccl_id_op.cc @@ -36,8 +36,8 @@ static void CopyXCCLIDToVar(const std::vector& xccl_ids, auto var = scope.FindVar(var_name); PADDLE_ENFORCE_NOT_NULL( var, - platform::errors::NotFound("Variable with name %s is not found", - var_name.c_str())); + phi::errors::NotFound("Variable with name %s is not found", + var_name.c_str())); auto xccl_id = var->GetMutable(); *xccl_id = xccl_ids[i]; } diff --git a/paddle/fluid/operators/collective/c_identity_op.cc b/paddle/fluid/operators/collective/c_identity_op.cc index c067c061b8613..78d4a27f822b4 100644 --- a/paddle/fluid/operators/collective/c_identity_op.cc +++ b/paddle/fluid/operators/collective/c_identity_op.cc @@ -31,7 +31,7 @@ class CIdentityOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE( ring_id, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The ring_id (%d) for c_identity must be non-negative.", ring_id)); framework::DDim dim = ctx->GetInputDim("X"); ctx->SetOutputDim("Out", dim); diff --git a/paddle/fluid/operators/collective/c_reduce_avg_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_avg_op.cu.cc index 07d2cc748900e..1dcd5a2c6489c 100644 --- a/paddle/fluid/operators/collective/c_reduce_avg_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_avg_op.cu.cc @@ -31,5 +31,5 @@ PD_REGISTER_STRUCT_KERNEL(c_reduce_avg, double, int, int64_t, - plat::float16, + phi::dtype::float16, plat::bfloat16) {} diff --git a/paddle/fluid/operators/collective/c_reduce_max_op.cc b/paddle/fluid/operators/collective/c_reduce_max_op.cc index a1509a89eb3b3..a0181c9f0e7af 100644 --- a/paddle/fluid/operators/collective/c_reduce_max_op.cc +++ b/paddle/fluid/operators/collective/c_reduce_max_op.cc @@ -53,4 +53,4 @@ PD_REGISTER_STRUCT_KERNEL(c_reduce_max, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc index 8973de0a19675..24f3dffd0517e 100644 --- a/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc @@ -31,4 +31,4 @@ PD_REGISTER_STRUCT_KERNEL(c_reduce_max, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/c_reduce_min_op.cc b/paddle/fluid/operators/collective/c_reduce_min_op.cc index 9b53d80e01607..621272895fe4c 100644 --- a/paddle/fluid/operators/collective/c_reduce_min_op.cc +++ b/paddle/fluid/operators/collective/c_reduce_min_op.cc @@ -52,4 +52,4 @@ PD_REGISTER_STRUCT_KERNEL(c_reduce_min, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc index e3239cb812cd9..c7d979bd932b6 100644 --- a/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc @@ -31,4 +31,4 @@ PD_REGISTER_STRUCT_KERNEL(c_reduce_min, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git 
a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h index d90fb88fe8f3f..0ea4187ffc4f2 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -85,7 +85,7 @@ class CReduceOpCPUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( gloo->IsInitialized(), true, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "You must initialize the gloo environment first to use it.")); gloo::ReduceOptions opts(gloo->GetContext()); opts.setInput(const_cast(send_buff), send_numel); @@ -113,14 +113,14 @@ class CReduceOpCPUKernel : public framework::OpKernel { &gloo::product)); break; default: - PADDLE_ENFORCE_EQ(true, - false, - platform::errors::InvalidArgument( - "Invalid reduce type: %d.", red_type)); + PADDLE_ENFORCE_EQ( + true, + false, + phi::errors::InvalidArgument("Invalid reduce type: %d.", red_type)); } gloo::reduce(opts); #else - PADDLE_THROW(platform::errors::Unavailable( + PADDLE_THROW(phi::errors::Unavailable( "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON")); #endif } @@ -158,7 +158,7 @@ class CReduceOpXPUKernel : public framework::OpKernel { if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. " @@ -169,7 +169,7 @@ class CReduceOpXPUKernel : public framework::OpKernel { comm_context_manager.Get(std::to_string(rid))); PADDLE_ENFORCE_NE(comm_ctx, nullptr, - platform::errors::Unavailable( + phi::errors::Unavailable( "BKCLCommContext is nullptr, collective op should " "has ring_id attr.")); stream = comm_ctx->GetStream(); @@ -205,8 +205,8 @@ class CReduceOpXPUKernel : public framework::OpKernel { break; default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid reduce type: %d", red_type)); + PADDLE_THROW( + phi::errors::InvalidArgument("Invalid reduce type: %d", red_type)); } if (comm_ctx) { @@ -222,7 +222,7 @@ class CReduceOpXPUKernel : public framework::OpKernel { stream)); } #else - PADDLE_THROW(platform::errors::PreconditionNotMet( + PADDLE_THROW(phi::errors::PreconditionNotMet( "PaddlePaddle should be compiled with XPU.")); #endif } @@ -260,7 +260,7 @@ class CReduceOpCUDAKernel : public framework::OpKernel { if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. 
" @@ -271,7 +271,7 @@ class CReduceOpCUDAKernel : public framework::OpKernel { comm_context_manager.Get(std::to_string(rid))); PADDLE_ENFORCE_NE(comm_ctx, nullptr, - platform::errors::Unavailable( + phi::errors::Unavailable( "NCCLCommContext is nullptr, collective op should " "has ring_id attr.")); stream = comm_ctx->GetStream(); @@ -311,11 +311,11 @@ class CReduceOpCUDAKernel : public framework::OpKernel { #endif default: - PADDLE_ENFORCE_EQ(true, - false, - platform::errors::InvalidArgument( - "red_type must be one of kRedSum, " - "kRedMax, kRedMin, kRedProd.")); + PADDLE_ENFORCE_EQ( + true, + false, + phi::errors::InvalidArgument("red_type must be one of kRedSum, " + "kRedMax, kRedMin, kRedProd.")); } if (comm_ctx) { @@ -331,10 +331,10 @@ class CReduceOpCUDAKernel : public framework::OpKernel { stream)); } #else - PADDLE_ENFORCE_EQ(true, - false, - platform::errors::Unavailable( - "PaddlePaddle should compile with GPU..")); + PADDLE_ENFORCE_EQ( + true, + false, + phi::errors::Unavailable("PaddlePaddle should compile with GPU..")); #endif } }; diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op.cc b/paddle/fluid/operators/collective/c_reduce_prod_op.cc index 20dacd19b382b..c34e799f5d8e1 100644 --- a/paddle/fluid/operators/collective/c_reduce_prod_op.cc +++ b/paddle/fluid/operators/collective/c_reduce_prod_op.cc @@ -53,4 +53,4 @@ PD_REGISTER_STRUCT_KERNEL(c_reduce_prod, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc index 675c274eb0638..b8b562031bc4e 100644 --- a/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc @@ -31,4 +31,4 @@ PD_REGISTER_STRUCT_KERNEL(c_reduce_prod, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op.cc b/paddle/fluid/operators/collective/c_reduce_sum_op.cc index 72be5c391fca2..5bf5c1c2f8b9f 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op.cc @@ -53,4 +53,4 @@ PD_REGISTER_STRUCT_KERNEL(c_reduce_sum, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc index dfae966a35eb0..56fd0e1293389 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc @@ -31,5 +31,5 @@ PD_REGISTER_STRUCT_KERNEL(c_reduce_sum, double, int, int64_t, - plat::float16, + phi::dtype::float16, plat::bfloat16) {} diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cc index 11c0094340f08..7726c3bf5ca41 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op.cc @@ -32,7 +32,7 @@ class CReduceScatterOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( dim[0] % nranks, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "dim[0] (%d) is not divisible by nranks(%d)", dim[0], nranks)); dim[0] /= nranks; } @@ -81,4 +81,4 @@ PD_REGISTER_STRUCT_KERNEL(c_reducescatter, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc 
b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc index 617a8f7b7f941..e00433ad7b4d6 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc @@ -46,7 +46,7 @@ class CReduceScatterOpCUDAKernel : public framework::OpKernel { if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. " @@ -57,12 +57,12 @@ class CReduceScatterOpCUDAKernel : public framework::OpKernel { comm_context_manager.Get(std::to_string(rid))); PADDLE_ENFORCE_NE(comm_ctx, nullptr, - platform::errors::Unavailable( + phi::errors::Unavailable( "NCCLCommContext is nullptr, collective op should " "has ring_id attr.")); PADDLE_ENFORCE_EQ(out_dims[0] % comm_ctx->GetSize(), 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input tensor X's " "dim[0] (%d) should be divisible by nranks(%d)", out_dims[0], @@ -74,7 +74,7 @@ class CReduceScatterOpCUDAKernel : public framework::OpKernel { comm = platform::NCCLCommContext::Instance().Get(rid, place); PADDLE_ENFORCE_EQ(out_dims[0] % comm->nranks(), 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input tensor X's " "dim[0] (%d) should be divisible by nranks(%d)", out_dims[0], @@ -90,7 +90,7 @@ class CReduceScatterOpCUDAKernel : public framework::OpKernel { int nranks = comm_ctx ? comm_ctx->GetSize() : comm->nranks(); PADDLE_ENFORCE_EQ(out_dims[0] % nranks, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input tensor X's " "dim[0] (%d) should be divisible by nranks(%d)", out_dims[0], @@ -117,7 +117,7 @@ class CReduceScatterOpCUDAKernel : public framework::OpKernel { stream)); } #else - PADDLE_THROW(platform::errors::PreconditionNotMet( + PADDLE_THROW(phi::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU.")); #endif } @@ -140,5 +140,5 @@ PD_REGISTER_STRUCT_KERNEL(c_reducescatter, #endif int, int64_t, - plat::float16) { + phi::dtype::float16) { } diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.h b/paddle/fluid/operators/collective/c_reducescatter_op.h index 52af0b9c43541..9f978f3f94bf3 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.h +++ b/paddle/fluid/operators/collective/c_reducescatter_op.h @@ -31,7 +31,7 @@ template class CReduceScatterOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx UNUSED) const override { - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(phi::errors::Unimplemented( "Unimplemented cpu kernel for CReduceScatterOp.")); } }; diff --git a/paddle/fluid/operators/collective/c_scatter_op.cc b/paddle/fluid/operators/collective/c_scatter_op.cc index 40b6eeacf8030..d3caf13485036 100644 --- a/paddle/fluid/operators/collective/c_scatter_op.cc +++ b/paddle/fluid/operators/collective/c_scatter_op.cc @@ -29,20 +29,20 @@ class CScatterOp : public framework::OperatorWithKernel { int nranks = ctx->Attrs().Get("nranks"); PADDLE_ENFORCE_GE(nranks, 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of ranks (%d) must be greater than 1 " "to use collective op (c_scatter op).", nranks)); PADDLE_ENFORCE_GE( root_id, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The root_id (%d) for c_scatter_op must 
be non-negative.", root_id)); PADDLE_ENFORCE_GE( ring_id, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The ring_id (%d) for c_scatter_op must be non-negative.", root_id)); framework::DDim dim = ctx->GetInputDim("X"); @@ -96,4 +96,4 @@ PD_REGISTER_STRUCT_KERNEL(c_scatter, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/c_scatter_op.cu.cc b/paddle/fluid/operators/collective/c_scatter_op.cu.cc index fc7a83ca638ee..7cfe5b6785b5a 100644 --- a/paddle/fluid/operators/collective/c_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/c_scatter_op.cu.cc @@ -47,13 +47,13 @@ class CScatterOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_GE( root_id, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The root_id (%d) for c_scatter_op must be non-negative.", root_id)); PADDLE_ENFORCE_GE( ring_id, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The ring_id (%d) for c_scatter_op must be non-negative.", ring_id)); @@ -62,7 +62,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel { if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. " @@ -73,12 +73,12 @@ class CScatterOpCUDAKernel : public framework::OpKernel { comm_context_manager.Get(std::to_string(ring_id))); PADDLE_ENFORCE_NE(comm_ctx, nullptr, - platform::errors::Unavailable( + phi::errors::Unavailable( "NCCLCommContext is nullptr, collective op should " "has ring_id attr.")); PADDLE_ENFORCE_EQ(nranks, comm_ctx->GetSize(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of ranks (%d) you set of must " "be equal to comm_ctx->GetSize() (%d).", nranks, @@ -90,7 +90,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel { comm = platform::NCCLCommContext::Instance().Get(ring_id, place); PADDLE_ENFORCE_EQ(nranks, comm->nranks(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of ranks (%d) you set of must " "be equal to comm->nranks (%d).", nranks, @@ -158,7 +158,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( true, false, - platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); + phi::errors::Unavailable("PaddlePaddle should compile with GPU.")); #endif } }; @@ -177,4 +177,4 @@ PD_REGISTER_STRUCT_KERNEL(c_scatter, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/c_scatter_op.h b/paddle/fluid/operators/collective/c_scatter_op.h index 76f3350a64c05..164b7f156de0a 100644 --- a/paddle/fluid/operators/collective/c_scatter_op.h +++ b/paddle/fluid/operators/collective/c_scatter_op.h @@ -44,7 +44,7 @@ class CScatterOpCPUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( gloo->IsInitialized(), true, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "You must initialize the gloo environment first to use it.")); int64_t send_numel = out->numel(); @@ -66,7 +66,7 @@ class CScatterOpCPUKernel : public framework::OpKernel { gloo::scatter(opts); #else - PADDLE_THROW(platform::errors::Unavailable( + PADDLE_THROW(phi::errors::Unavailable( "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON")); #endif } diff --git 
a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cc index e4de0ceb136c1..496733759adb3 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cc @@ -46,7 +46,7 @@ class CSoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { if (ctx->IsRuntime() || (logits_dims[i] > 0 && labels_dims[i] > 0)) { PADDLE_ENFORCE_EQ(logits_dims[i], labels_dims[i], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(Logits) and Input(Label) should in " "same shape in dimensions except axis.")); } @@ -56,7 +56,7 @@ class CSoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( labels_dims[logits_rank - 1], 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "the last dimension of Input(Label) should be 1." "But received: the last dimension of Input(Label) is [%d]," "the last dimension is [%d]", @@ -130,22 +130,22 @@ class CSoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Loss")), - true, - platform::errors::InvalidArgument( - "Input(Loss@Grad) should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Softmax"), - true, - platform::errors::InvalidArgument( - "Input(Softmax) should be not null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput(framework::GradVarName("Loss")), + true, + phi::errors::InvalidArgument("Input(Loss@Grad) should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Softmax"), + true, + phi::errors::InvalidArgument("Input(Softmax) should be not null.")); PADDLE_ENFORCE_EQ( ctx->HasInput("Label"), true, - platform::errors::InvalidArgument("Input(Label) should be not null.")); + phi::errors::InvalidArgument("Input(Label) should be not null.")); PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Logits")), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(Logits@Grad) should be not null.")); ctx->SetOutputDim(framework::GradVarName("Logits"), @@ -209,4 +209,4 @@ PD_REGISTER_STRUCT_KERNEL(c_softmax_with_cross_entropy, ops::CSoftmaxWithCrossEntropyOpCPUKernel, float, double, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu index e65ebafad7235..80ce7ce50c4a0 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu @@ -155,7 +155,7 @@ struct CSoftmaxWithCrossEntropyFunctor { if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. 
" @@ -166,7 +166,7 @@ struct CSoftmaxWithCrossEntropyFunctor { comm_context_manager.Get(std::to_string(rid))); PADDLE_ENFORCE_NE(comm_ctx, nullptr, - platform::errors::Unavailable( + phi::errors::Unavailable( "NCCLCommContext is nullptr, collective op should " "has ring_id attr.")); @@ -551,11 +551,11 @@ PD_REGISTER_STRUCT_KERNEL(c_softmax_with_cross_entropy, ops::CSoftmaxWithCrossEntropyOpCUDAKernel, float, double, - plat::float16) {} + phi::dtype::float16) {} PD_REGISTER_STRUCT_KERNEL(c_softmax_with_cross_entropy_grad, GPU, ALL_LAYOUT, ops::CSoftmaxWithCrossEntropyGradCUDAKernel, float, double, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h index 9b6a2c86897cb..3689cbcefd9bd 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h @@ -33,7 +33,7 @@ template class CSoftmaxWithCrossEntropyOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx UNUSED) const override { - PADDLE_THROW(platform::errors::Unavailable( + PADDLE_THROW(phi::errors::Unavailable( "Do not support c_embedding for cpu kernel now.")); } }; diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op_xpu.cc b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op_xpu.cc index 499b25e65974b..65329ccd8b269 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op_xpu.cc @@ -278,7 +278,7 @@ struct CSoftmaxWithCrossEntropyFunctor { if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. 
" @@ -289,7 +289,7 @@ struct CSoftmaxWithCrossEntropyFunctor { comm_context_manager.Get(std::to_string(rid))); PADDLE_ENFORCE_NE(comm_ctx, nullptr, - platform::errors::Unavailable( + phi::errors::Unavailable( "BKCLCommContext is nullptr, collective op should " "has ring_id attr.")); diff --git a/paddle/fluid/operators/collective/c_split_op.cc b/paddle/fluid/operators/collective/c_split_op.cc index dd65b99e3b7ee..f684c6fe35cf9 100644 --- a/paddle/fluid/operators/collective/c_split_op.cc +++ b/paddle/fluid/operators/collective/c_split_op.cc @@ -27,38 +27,38 @@ class CSplitOp : public framework::OperatorWithKernel { int nranks = ctx->Attrs().Get("nranks"); int rank = ctx->Attrs().Get("rank"); int ring_id = ctx->Attrs().Get("ring_id"); - PADDLE_ENFORCE_GE(nranks, - 2, - platform::errors::InvalidArgument( - "The number of ranks (%d) for c_split " - "must be greater than 1.", - nranks)); + PADDLE_ENFORCE_GE( + nranks, + 2, + phi::errors::InvalidArgument("The number of ranks (%d) for c_split " + "must be greater than 1.", + nranks)); PADDLE_ENFORCE_GE( ring_id, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The ring_id (%d) for c_split must be non-negative.", ring_id)); PADDLE_ENFORCE_GE( rank, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rank (%d) for c_split must be non-negative.", rank)); - PADDLE_ENFORCE_LT(rank, - nranks, - platform::errors::InvalidArgument( - "The value of rank (%d) for c_split must " - "be less than that of nranks.", - rank, - nranks)); + PADDLE_ENFORCE_LT( + rank, + nranks, + phi::errors::InvalidArgument("The value of rank (%d) for c_split must " + "be less than that of nranks.", + rank, + nranks)); framework::DDim dim = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ( dim[dim.size() - 1] % nranks, 0, - platform::errors::InvalidArgument("The last dimension (%d) of the X " - "should be divisible by nranks (%d)", - dim[dim.size() - 1], - nranks)); + phi::errors::InvalidArgument("The last dimension (%d) of the X " + "should be divisible by nranks (%d)", + dim[dim.size() - 1], + nranks)); dim[dim.size() - 1] = dim[dim.size() - 1] / nranks; if (dim[0] < 0) dim[0] = -1; diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cu.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cu.cc index 526726ae3c772..8d1134be70de1 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cu.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cu.cc @@ -24,5 +24,5 @@ PD_REGISTER_STRUCT_KERNEL(c_sync_calc_stream, double, int, int64_t, - plat::float16, + phi::dtype::float16, plat::bfloat16) {} diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h index e100397924af5..a0e2d858ebd38 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h @@ -51,14 +51,14 @@ class CSyncCalcStreamKernel : public framework::OpKernel { auto place = ctx.GetPlace(); PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), true, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Sync stream op can run on xpu place only for now.")); auto dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); dev_ctx->Wait(); #else - PADDLE_THROW(platform::errors::PreconditionNotMet( + PADDLE_THROW(phi::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU.")); #endif } diff --git 
a/paddle/fluid/operators/collective/c_sync_calc_stream_op_xpu.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op_xpu.cc index 24157f1c64a6c..1448e1e3745ec 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op_xpu.cc @@ -25,4 +25,4 @@ PD_REGISTER_STRUCT_KERNEL(c_sync_calc_stream, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.h b/paddle/fluid/operators/collective/c_sync_comm_stream_op.h index d5fdad8f04f86..d67ef6820d021 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.h +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.h @@ -50,7 +50,7 @@ class CSyncCommStreamKernel : public framework::OpKernel { if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. " @@ -73,7 +73,7 @@ class CSyncCommStreamKernel : public framework::OpKernel { auto place = ctx.GetPlace(); PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), true, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Sync stream op can run on xpu place only for now.")); int ring_id = ctx.Attr("ring_id"); XPUStream stream = nullptr; @@ -82,7 +82,7 @@ class CSyncCommStreamKernel : public framework::OpKernel { if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. " @@ -102,7 +102,7 @@ class CSyncCommStreamKernel : public framework::OpKernel { platform::XPUStreamSync(stream); #else - PADDLE_THROW(platform::errors::PreconditionNotMet( + PADDLE_THROW(phi::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU or XPU.")); #endif } diff --git a/paddle/fluid/operators/collective/c_wait_comm_op.cc b/paddle/fluid/operators/collective/c_wait_comm_op.cc index fbb9c0d1ca7ce..10f4d9726f21b 100644 --- a/paddle/fluid/operators/collective/c_wait_comm_op.cc +++ b/paddle/fluid/operators/collective/c_wait_comm_op.cc @@ -43,7 +43,7 @@ class CWaitCommOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ( platform::is_gpu_place(place), true, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "wait_comm op can run on gpu place only for now, but got %s", place.DebugString())); @@ -62,7 +62,7 @@ class CWaitCommOp : public framework::OperatorBase { if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. 
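The companion change in the registration hunks swaps the plat::float16 alias for the fully qualified phi::dtype::float16 in each kernel's dtype list. A sketch of the shape of such a registration; the op name and kernel class here are placeholders, not taken from this patch:

    PD_REGISTER_STRUCT_KERNEL(example_op,            // placeholder op name
                              CPU,                   // backend
                              ALL_LAYOUT,            // layout
                              ops::ExampleOpKernel,  // placeholder kernel class
                              float,
                              double,
                              phi::dtype::float16) {}  // previously plat::float16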
" @@ -94,7 +94,7 @@ class CWaitCommOp : public framework::OperatorBase { PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(compute_stream, event, 0)); #endif #else - PADDLE_THROW(platform::errors::PreconditionNotMet( + PADDLE_THROW(phi::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU.")); #endif } diff --git a/paddle/fluid/operators/collective/c_wait_compute_op.cc b/paddle/fluid/operators/collective/c_wait_compute_op.cc index 040e86c46b9ec..a548998ce757d 100644 --- a/paddle/fluid/operators/collective/c_wait_compute_op.cc +++ b/paddle/fluid/operators/collective/c_wait_compute_op.cc @@ -43,7 +43,7 @@ class CWaitComputeOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ( platform::is_gpu_place(place), true, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "wait_compute op can run on gpu place only for now, but got %s", place.DebugString())); @@ -62,7 +62,7 @@ class CWaitComputeOp : public framework::OperatorBase { if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. " @@ -94,7 +94,7 @@ class CWaitComputeOp : public framework::OperatorBase { PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(comm_stream, event, 0)); #endif #else - PADDLE_THROW(platform::errors::PreconditionNotMet( + PADDLE_THROW(phi::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU.")); #endif } diff --git a/paddle/fluid/operators/collective/gen_bkcl_id_op.cc b/paddle/fluid/operators/collective/gen_bkcl_id_op.cc index fc765e3bde983..f7aa3baea0d60 100644 --- a/paddle/fluid/operators/collective/gen_bkcl_id_op.cc +++ b/paddle/fluid/operators/collective/gen_bkcl_id_op.cc @@ -34,10 +34,10 @@ namespace operators { static void GenBKCLID(std::vector* bkcl_ids) { for (size_t i = 0; i < bkcl_ids->size(); ++i) { BKCLResult_t ret = bkcl_get_unique_id(&(*bkcl_ids)[i]); - PADDLE_ENFORCE_EQ(BKCL_SUCCESS, - ret, - platform::errors::PreconditionNotMet( - "bkcl get unique id failed [%d]", ret)); + PADDLE_ENFORCE_EQ( + BKCL_SUCCESS, + ret, + phi::errors::PreconditionNotMet("bkcl get unique id failed [%d]", ret)); } } @@ -49,8 +49,8 @@ static void CopyBKCLIDToVar(const std::vector& bkcl_ids, auto var = scope.FindVar(var_name); PADDLE_ENFORCE_NOT_NULL( var, - platform::errors::NotFound("Variable with name %s is not found", - var_name.c_str())); + phi::errors::NotFound("Variable with name %s is not found", + var_name.c_str())); auto bkcl_id = var->GetMutable(); memcpy(bkcl_id, &bkcl_ids[i], sizeof(BKCLUniqueId)); } @@ -74,14 +74,14 @@ class GenBKCLIdOp : public framework::OperatorBase { PADDLE_ENFORCE_GE( trainer_id, 0, - platform::errors::InvalidArgument("trainer_id %d is less than 0. Its " - "valid range is [0, trainer_size)")); + phi::errors::InvalidArgument("trainer_id %d is less than 0. Its " + "valid range is [0, trainer_size)")); PADDLE_ENFORCE_LT( trainer_id, static_cast(trainers.size()), - platform::errors::OutOfRange("trainer_id %d is out of range. Its valid " - "range is [0, trainer_size)", - trainer_id)); + phi::errors::OutOfRange("trainer_id %d is out of range. 
Its valid " + "range is [0, trainer_size)", + trainer_id)); int bkcl_comm_num = Attr("bkcl_comm_num"); int use_hierarchical_allreduce = Attr("use_hierarchical_allreduce"); @@ -93,18 +93,18 @@ class GenBKCLIdOp : public framework::OperatorBase { PADDLE_ENFORCE_GT( trainers.size(), 1, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The number of collective trainers %llu <= 1", trainers.size())); PADDLE_ENFORCE_GT( inter_nranks, 1, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "inter_nranks %d <= 1 while in hierarchical allreduce mode", inter_nranks)); PADDLE_ENFORCE_EQ( trainers.size() % inter_nranks, 0, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The number of trainers %llu mod inter_nranks %d is not equal 0", trainers.size(), inter_nranks)); diff --git a/paddle/fluid/operators/collective/gen_nccl_id_op.cc b/paddle/fluid/operators/collective/gen_nccl_id_op.cc index 1d03cb151e4a0..37406b2918d7f 100644 --- a/paddle/fluid/operators/collective/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/gen_nccl_id_op.cc @@ -49,8 +49,8 @@ static void CopyNCCLIDToVar(const std::vector& nccl_ids, auto var = scope.FindVar(var_name); PADDLE_ENFORCE_NOT_NULL( var, - platform::errors::NotFound("Variable with name %s is not found", - var_name.c_str())); + phi::errors::NotFound("Variable with name %s is not found", + var_name.c_str())); auto nccl_id = var->GetMutable(); memcpy(nccl_id, &nccl_ids[i], sizeof(ncclUniqueId)); } @@ -74,14 +74,14 @@ class GenNCCLIdOp : public framework::OperatorBase { PADDLE_ENFORCE_GE( trainer_id, 0, - platform::errors::InvalidArgument("trainer_id %d is less than 0. Its " - "valid range is [0, trainer_size)")); + phi::errors::InvalidArgument("trainer_id %d is less than 0. Its " + "valid range is [0, trainer_size)")); PADDLE_ENFORCE_LT( trainer_id, static_cast(trainers.size()), - platform::errors::OutOfRange("trainer_id %d is out of range. Its valid " - "range is [0, trainer_size)", - trainer_id)); + phi::errors::OutOfRange("trainer_id %d is out of range. 
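One detail visible in both Gen*IdOp hunks: the InvalidArgument message for the lower-bound check carries a %d placeholder but, unlike the OutOfRange check beside it, passes no trainer_id argument; the patch only renames the namespace and leaves that pre-existing quirk intact. The pair of checks as presumably intended, with the missing argument supplied in this sketch only:

    PADDLE_ENFORCE_GE(
        trainer_id,
        0,
        phi::errors::InvalidArgument("trainer_id %d is less than 0. Its "
                                     "valid range is [0, trainer_size)",
                                     trainer_id));  // argument added in this sketch
    PADDLE_ENFORCE_LT(
        trainer_id,
        static_cast<int>(trainers.size()),
        phi::errors::OutOfRange("trainer_id %d is out of range. Its valid "
                                "range is [0, trainer_size)",
                                trainer_id));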
Its valid " + "range is [0, trainer_size)", + trainer_id)); int nccl_comm_num = Attr("nccl_comm_num"); int use_hierarchical_allreduce = Attr("use_hierarchical_allreduce"); @@ -93,18 +93,18 @@ class GenNCCLIdOp : public framework::OperatorBase { PADDLE_ENFORCE_GT( trainers.size(), 1, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The number of collective trainers %llu <= 1", trainers.size())); PADDLE_ENFORCE_GT( inter_nranks, 1, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "inter_nranks %d <= 1 while in hierarchical allreduce mode", inter_nranks)); PADDLE_ENFORCE_EQ( trainers.size() % inter_nranks, 0, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The number of trainers %llu mod inter_nranks %d is not equal 0", trainers.size(), inter_nranks)); diff --git a/paddle/fluid/operators/collective/global_gather_op.cc b/paddle/fluid/operators/collective/global_gather_op.cc index de93ca747b4e9..1b74fc6bde5f7 100644 --- a/paddle/fluid/operators/collective/global_gather_op.cc +++ b/paddle/fluid/operators/collective/global_gather_op.cc @@ -32,18 +32,18 @@ class GlobalGatherOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE( ring_id, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The ring_id (%d) for global gather op must be non-negative.", ring_id)); auto input_dims = ctx->GetInputDim("X"); auto ndim_input = input_dims.size(); // dim check - PADDLE_ENFORCE_EQ(ndim_input, - 2, - platform::errors::InvalidArgument( - "The input tensor's dimension must be 2. " - "But received input's dimension = %d.", - ndim_input)); + PADDLE_ENFORCE_EQ( + ndim_input, + 2, + phi::errors::InvalidArgument("The input tensor's dimension must be 2. " + "But received input's dimension = %d.", + ndim_input)); framework::DDim out_dims = common::make_ddim({-1, -1}); ctx->SetOutputDim("Out", out_dims); } @@ -119,4 +119,4 @@ PD_REGISTER_STRUCT_KERNEL(global_gather, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/global_gather_op.cu.cc b/paddle/fluid/operators/collective/global_gather_op.cu.cc index b3dcc2aac9423..8c0285cba049d 100644 --- a/paddle/fluid/operators/collective/global_gather_op.cu.cc +++ b/paddle/fluid/operators/collective/global_gather_op.cu.cc @@ -41,11 +41,11 @@ struct GlobalGatherFunctor { auto global_count_type = framework::TransToProtoVarType(global_count->dtype()); if (local_count_type != framework::proto::VarType::INT64) { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Please use int64 type in local_count.")); } if (global_count_type != framework::proto::VarType::INT64) { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Please use int64 type in global_count.")); } auto out = ctx.Output("Out"); @@ -80,7 +80,7 @@ struct GlobalGatherFunctor { PADDLE_ENFORCE_GE( ring_id, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The ring_id (%d) for global gather op must be non-negative.", ring_id)); auto place = ctx.GetPlace(); @@ -94,7 +94,7 @@ struct GlobalGatherFunctor { if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm " @@ -105,7 +105,7 @@ struct GlobalGatherFunctor { 
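Many of the collective kernels in this section also share the communicator-lookup flow whose messages are being retagged: when FLAGS_dynamic_static_unified_comm is on, the kernel resolves its NCCL context from the per-ring CommContextManager and requires it to exist. A condensed sketch of that control flow, with the long error messages abbreviated:

    const auto& mgr = phi::distributed::CommContextManager::GetInstance();
    if (FLAGS_dynamic_static_unified_comm) {
      PADDLE_ENFORCE_EQ(
          mgr.Has(std::to_string(ring_id)),
          true,
          phi::errors::InvalidArgument(
              "ring_id %d not found in CommContextManager.", ring_id));
      auto* comm_ctx = static_cast<phi::distributed::NCCLCommContext*>(
          mgr.Get(std::to_string(ring_id)));
      PADDLE_ENFORCE_NE(comm_ctx,
                        nullptr,
                        phi::errors::Unavailable(
                            "NCCLCommContext is nullptr; the collective op "
                            "requires a ring_id attribute."));
      stream = comm_ctx->GetStream();  // run the collective on the comm stream
    }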
comm_context_manager.Get(std::to_string(ring_id))); PADDLE_ENFORCE_NE(comm_ctx, nullptr, - platform::errors::Unavailable( + phi::errors::Unavailable( "NCCLCommContext is nullptr, collective op should " "has ring_id attr.")); stream = comm_ctx->GetStream(); @@ -192,12 +192,11 @@ struct GlobalGatherFunctor { } } #else - PADDLE_THROW( - platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); + PADDLE_THROW(phi::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); #endif #else PADDLE_THROW( - platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); + phi::errors::Unavailable("PaddlePaddle should compile with GPU.")); #endif } }; @@ -215,11 +214,11 @@ struct GlobalGatherProcessGroupFunctor { auto global_count_type = framework::TransToProtoVarType(global_count->dtype()); if (local_count_type != framework::proto::VarType::INT64) { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Please use int64 type in local_count.")); } if (global_count_type != framework::proto::VarType::INT64) { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Please use int64 type in global_count.")); } auto out = ctx.Output("Out"); @@ -251,7 +250,7 @@ struct GlobalGatherProcessGroupFunctor { PADDLE_ENFORCE_GE( ring_id, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The ring_id (%d) for global gather op must be non-negative.", ring_id)); auto place = ctx.GetPlace(); @@ -309,12 +308,11 @@ struct GlobalGatherProcessGroupFunctor { #endif #else - PADDLE_THROW( - platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); + PADDLE_THROW(phi::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); #endif #else PADDLE_THROW( - platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); + phi::errors::Unavailable("PaddlePaddle should compile with GPU.")); #endif } }; @@ -349,4 +347,4 @@ PD_REGISTER_STRUCT_KERNEL(global_gather, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/global_gather_op.h b/paddle/fluid/operators/collective/global_gather_op.h index 723c5e48a5ae4..0ab3dd5da985f 100644 --- a/paddle/fluid/operators/collective/global_gather_op.h +++ b/paddle/fluid/operators/collective/global_gather_op.h @@ -29,7 +29,7 @@ template class GlobalGatherOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx UNUSED) const override { - PADDLE_THROW(platform::errors::Unavailable( + PADDLE_THROW(phi::errors::Unavailable( "Do not support global gather op for cpu kernel now.")); } }; diff --git a/paddle/fluid/operators/collective/global_scatter_op.cc b/paddle/fluid/operators/collective/global_scatter_op.cc index 095f968306bdc..e6b1bb8295bde 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.cc +++ b/paddle/fluid/operators/collective/global_scatter_op.cc @@ -34,18 +34,18 @@ class GlobalScatterOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE( ring_id, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The ring_id (%d) for global scatter op must be non-negative.", ring_id)); auto input_dims = ctx->GetInputDim("X"); auto ndim_input = input_dims.size(); // dim check - PADDLE_ENFORCE_EQ(ndim_input, - 2, - platform::errors::InvalidArgument( - "The input tensor's dimension must be 2. 
" - "But received input's dimension = %d.", - ndim_input)); + PADDLE_ENFORCE_EQ( + ndim_input, + 2, + phi::errors::InvalidArgument("The input tensor's dimension must be 2. " + "But received input's dimension = %d.", + ndim_input)); framework::DDim out_dims = common::make_ddim({-1, -1}); ctx->SetOutputDim("Out", out_dims); @@ -123,4 +123,4 @@ PD_REGISTER_STRUCT_KERNEL(global_scatter, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/global_scatter_op.cu.cc b/paddle/fluid/operators/collective/global_scatter_op.cu.cc index 096c33c3ef3cc..1eeb23fa602e2 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/global_scatter_op.cu.cc @@ -42,11 +42,11 @@ struct GlobalScatterFunctor { auto global_count_type = framework::TransToProtoVarType(global_count->dtype()); if (local_count_type != framework::proto::VarType::INT64) { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Please use int64 type in local_count.")); } if (global_count_type != framework::proto::VarType::INT64) { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Please use int64 type in global_count.")); } auto out = ctx.Output("Out"); @@ -79,7 +79,7 @@ struct GlobalScatterFunctor { PADDLE_ENFORCE_GE( ring_id, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The ring_id (%d) for global scatter op must be non-negative.", ring_id)); @@ -95,7 +95,7 @@ struct GlobalScatterFunctor { if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. 
" @@ -106,7 +106,7 @@ struct GlobalScatterFunctor { comm_context_manager.Get(std::to_string(ring_id))); PADDLE_ENFORCE_NE(comm_ctx, nullptr, - platform::errors::Unavailable( + phi::errors::Unavailable( "NCCLCommContext is nullptr, collective op should " "has ring_id attr.")); @@ -201,12 +201,11 @@ struct GlobalScatterFunctor { } #else - PADDLE_THROW( - platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); + PADDLE_THROW(phi::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); #endif #else PADDLE_THROW( - platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); + phi::errors::Unavailable("PaddlePaddle should compile with GPU.")); #endif } }; @@ -224,11 +223,11 @@ struct GlobalScatterProcessGroupFunctor { auto global_count_type = framework::TransToProtoVarType(global_count->dtype()); if (local_count_type != framework::proto::VarType::INT64) { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Please use int64 type in local_count.")); } if (global_count_type != framework::proto::VarType::INT64) { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Please use int64 type in global_count.")); } auto out = ctx.Output("Out"); @@ -258,7 +257,7 @@ struct GlobalScatterProcessGroupFunctor { PADDLE_ENFORCE_GE( ring_id, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The ring_id (%d) for global scatter op must be non-negative.", ring_id)); @@ -316,12 +315,11 @@ struct GlobalScatterProcessGroupFunctor { #endif #else - PADDLE_THROW( - platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); + PADDLE_THROW(phi::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); #endif #else PADDLE_THROW( - platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); + phi::errors::Unavailable("PaddlePaddle should compile with GPU.")); #endif } }; @@ -356,4 +354,4 @@ PD_REGISTER_STRUCT_KERNEL(global_scatter, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/global_scatter_op.h b/paddle/fluid/operators/collective/global_scatter_op.h index fc4b48500c071..36ea0b151dc4b 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.h +++ b/paddle/fluid/operators/collective/global_scatter_op.h @@ -29,7 +29,7 @@ template class GlobalScatterOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx UNUSED) const override { - PADDLE_THROW(platform::errors::Unavailable( + PADDLE_THROW(phi::errors::Unavailable( "Do not support global scatter op for cpu kernel now.")); } }; diff --git a/paddle/fluid/operators/collective/mp_allreduce_sum_op.cc b/paddle/fluid/operators/collective/mp_allreduce_sum_op.cc index f680818da2d94..d30d52821e74e 100644 --- a/paddle/fluid/operators/collective/mp_allreduce_sum_op.cc +++ b/paddle/fluid/operators/collective/mp_allreduce_sum_op.cc @@ -96,4 +96,4 @@ PD_REGISTER_STRUCT_KERNEL(mp_allreduce_sum, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/mp_allreduce_sum_op.cu.cc b/paddle/fluid/operators/collective/mp_allreduce_sum_op.cu.cc index b4773a8eb5456..fc856ea04e6f2 100644 --- a/paddle/fluid/operators/collective/mp_allreduce_sum_op.cu.cc +++ b/paddle/fluid/operators/collective/mp_allreduce_sum_op.cu.cc @@ -34,5 +34,5 @@ PD_REGISTER_STRUCT_KERNEL(mp_allreduce_sum, #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, #endif - 
plat::float16) { + phi::dtype::float16) { } diff --git a/paddle/fluid/operators/collective/mp_allreduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/mp_allreduce_sum_op_xpu.cc index 9638bf68d1717..323d39f62092e 100644 --- a/paddle/fluid/operators/collective/mp_allreduce_sum_op_xpu.cc +++ b/paddle/fluid/operators/collective/mp_allreduce_sum_op_xpu.cc @@ -28,4 +28,4 @@ PD_REGISTER_STRUCT_KERNEL(mp_allreduce_sum, ops::CAllReduceSumXPUKernel, float, int, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/partial_allgather_op.cc b/paddle/fluid/operators/collective/partial_allgather_op.cc index 75220ea5b30a5..3ae33ecd9eeba 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.cc +++ b/paddle/fluid/operators/collective/partial_allgather_op.cc @@ -26,14 +26,14 @@ class PartialAllGatherOp : public framework::OperatorWithKernel { int nranks = ctx->Attrs().Get("nranks"); int rank = ctx->Attrs().Get("rank"); - PADDLE_ENFORCE_GE(nranks, - 2, - platform::errors::InvalidArgument( - "The value of nranks should be >=2.")); + PADDLE_ENFORCE_GE( + nranks, + 2, + phi::errors::InvalidArgument("The value of nranks should be >=2.")); PADDLE_ENFORCE_EQ( (rank >= 0 && rank < nranks), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rank (%d) for partial_allgather op must >=0 and { if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. " @@ -67,7 +67,7 @@ class PartialAllGatherOpCUDAKernel : public framework::OpKernel { comm_context_manager.Get(std::to_string(rid))); PADDLE_ENFORCE_NE(comm_ctx, nullptr, - platform::errors::Unavailable( + phi::errors::Unavailable( "NCCLCommContext is nullptr, collective op should " "has ring_id attr.")); @@ -87,17 +87,17 @@ class PartialAllGatherOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( nranks, real_nranks, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "nranks: %s should equal to %s", nranks, real_nranks)); PADDLE_ENFORCE_EQ(rank, real_rank, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "rank: %s should equal to %s", rank, real_rank)); PADDLE_ENFORCE_EQ( (numel % nranks), 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input numel (%d) must be divisible by nranks(%d)", numel, nranks)); @@ -137,7 +137,7 @@ class PartialAllGatherOpCUDAKernel : public framework::OpKernel { } } #else - PADDLE_THROW(platform::errors::PreconditionNotMet( + PADDLE_THROW(phi::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU.")); #endif } @@ -160,5 +160,5 @@ PD_REGISTER_STRUCT_KERNEL(partial_allgather, #endif int, int64_t, - plat::float16) { + phi::dtype::float16) { } diff --git a/paddle/fluid/operators/collective/partial_allgather_op.h b/paddle/fluid/operators/collective/partial_allgather_op.h index 178545f4dd2d3..4b410154712e2 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.h +++ b/paddle/fluid/operators/collective/partial_allgather_op.h @@ -30,7 +30,7 @@ template class PartialAllGatherOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx UNUSED) const override { - PADDLE_THROW(platform::errors::Unavailable( + PADDLE_THROW(phi::errors::Unavailable( "Do not support partial_allgather for cpu 
kernel now.")); } }; diff --git a/paddle/fluid/operators/collective/partial_recv_op.cc b/paddle/fluid/operators/collective/partial_recv_op.cc index 5d8a1276a630e..2a512260a792d 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.cc +++ b/paddle/fluid/operators/collective/partial_recv_op.cc @@ -34,26 +34,26 @@ class PartialRecvOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE( peer, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The peer (%d) for partial_recv op must be non-negative.", peer)); PADDLE_ENFORCE_GE( ring_id, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The ring_id (%d) for partial_recv op must be non-negative.", ring_id)); PADDLE_ENFORCE_GE(num, 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The num (%d) for partial_send op must >=1", num)); PADDLE_ENFORCE_EQ( (id >= 0 && id < num), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The id (%d) for partial_send op must >=0 and SetOutputDim("Out", common::make_ddim(out_shape)); @@ -137,4 +137,4 @@ PD_REGISTER_STRUCT_KERNEL(partial_recv, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/partial_recv_op.cu.cc b/paddle/fluid/operators/collective/partial_recv_op.cu.cc index 912de046b63af..7e623706b2037 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_recv_op.cu.cc @@ -49,26 +49,26 @@ class PartialRecvOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_GE( rid, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The ring_id (%d) for partial_recv op must be non-negative.", rid)); PADDLE_ENFORCE_GE( peer, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The peer (%d) for partial_recv op must be non-negative.", peer)); PADDLE_ENFORCE_GE(num, 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The num (%d) for partial_recv op must >=1", num)); PADDLE_ENFORCE_EQ( (id >= 0 && id < num), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The id (%d) for partial_recv op must >=0 and { PADDLE_ENFORCE_EQ( comm_context_manager.Has(std::to_string(rid)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. 
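The partial_recv checks above (and the partial_send ones that follow) repeat one small invariant set: peer and ring_id must be non-negative, num must be at least 1, and id must fall in [0, num). Condensed into a sketch with shortened messages:

    PADDLE_ENFORCE_GE(
        peer, 0,
        phi::errors::InvalidArgument(
            "The peer (%d) must be non-negative.", peer));
    PADDLE_ENFORCE_GE(
        num, 1,
        phi::errors::InvalidArgument("The num (%d) must be >= 1.", num));
    PADDLE_ENFORCE_EQ(
        (id >= 0 && id < num), true,
        phi::errors::InvalidArgument(
            "The id (%d) must be in the range [0, num).", id));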
" @@ -110,7 +110,7 @@ class PartialRecvOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_NE( comm_ctx, nullptr, - platform::errors::Unavailable( + phi::errors::Unavailable( "NCCLCommContext is nullptr, collective op should " "has ring_id attr.")); @@ -134,13 +134,13 @@ class PartialRecvOpCUDAKernel : public framework::OpKernel { stream = ctx.cuda_device_context().stream(); } - PADDLE_ENFORCE_LT(peer, - nranks, - platform::errors::InvalidArgument( - "The value of peer (%d) you set must " - "be less than nranks (%d).", - peer, - nranks)); + PADDLE_ENFORCE_LT( + peer, + nranks, + phi::errors::InvalidArgument("The value of peer (%d) you set must " + "be less than nranks (%d).", + peer, + nranks)); ncclDataType_t dtype = platform::ToNCCLDataType(type); @@ -161,7 +161,7 @@ class PartialRecvOpCUDAKernel : public framework::OpKernel { << offset << "] from " << peer; } #else - PADDLE_THROW(platform::errors::Unavailable( + PADDLE_THROW(phi::errors::Unavailable( "PaddlePaddle should be compiled with NCCL and " "NCCL version >= 2.7.3 is needed.")); #endif @@ -185,5 +185,5 @@ PD_REGISTER_STRUCT_KERNEL(partial_recv, #endif int, int64_t, - plat::float16) { + phi::dtype::float16) { } diff --git a/paddle/fluid/operators/collective/partial_recv_op.h b/paddle/fluid/operators/collective/partial_recv_op.h index baf47ef9dff8d..0840b85e504b4 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.h +++ b/paddle/fluid/operators/collective/partial_recv_op.h @@ -28,7 +28,7 @@ template class PartialRecvOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx UNUSED) const override { - PADDLE_THROW(platform::errors::Unavailable( + PADDLE_THROW(phi::errors::Unavailable( "Do not support partial_recv for cpu kernel now.")); } }; diff --git a/paddle/fluid/operators/collective/partial_send_op.cc b/paddle/fluid/operators/collective/partial_send_op.cc index a655479d3d8af..388ece7f4ba12 100644 --- a/paddle/fluid/operators/collective/partial_send_op.cc +++ b/paddle/fluid/operators/collective/partial_send_op.cc @@ -31,22 +31,22 @@ class PartialSendOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE( peer, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The peer (%d) for partial_send op must be non-negative.", peer)); PADDLE_ENFORCE_GE( ring_id, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The ring_id (%d) for partial_send op must be non-negative.", ring_id)); PADDLE_ENFORCE_GE(num, 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The num (%d) for partial_send op must >=1", num)); PADDLE_ENFORCE_EQ( (id >= 0 && id < num), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The id (%d) for partial_send op must >=0 and { PADDLE_ENFORCE_GE( rid, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The ring_id (%d) for partial_send op must be non-negative.", rid)); PADDLE_ENFORCE_GE( peer, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The peer (%d) for partial_send op must be non-negative.", peer)); PADDLE_ENFORCE_GE(num, 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The num (%d) for partial_send op must >=1", num)); PADDLE_ENFORCE_EQ( (id >= 0 && id < num), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The id (%d) for partial_send op must >=0 and { PADDLE_ENFORCE_EQ( comm_context_manager.Has(std::to_string(rid)), true, - platform::errors::InvalidArgument( + 
phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. " @@ -104,7 +104,7 @@ class PartialSendCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_NE( comm_ctx, nullptr, - platform::errors::Unavailable( + phi::errors::Unavailable( "NCCLCommContext is nullptr, collective op should " "has ring_id attr.")); @@ -128,13 +128,13 @@ class PartialSendCUDAKernel : public framework::OpKernel { stream = ctx.cuda_device_context().stream(); } - PADDLE_ENFORCE_LT(peer, - nranks, - platform::errors::InvalidArgument( - "The value of peer (%d) you set must " - "be less than ranks (%d).", - peer, - nranks)); + PADDLE_ENFORCE_LT( + peer, + nranks, + phi::errors::InvalidArgument("The value of peer (%d) you set must " + "be less than ranks (%d).", + peer, + nranks)); ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); @@ -157,9 +157,9 @@ class PartialSendCUDAKernel : public framework::OpKernel { << offset << "] to " << peer; } #else - PADDLE_THROW(platform::errors::Unavailable( - "PaddlePaddle should be compiled with NCCL " - "and NCCL version >= 2.7.3 is needed.")); + PADDLE_THROW( + phi::errors::Unavailable("PaddlePaddle should be compiled with NCCL " + "and NCCL version >= 2.7.3 is needed.")); #endif } }; @@ -181,5 +181,5 @@ PD_REGISTER_STRUCT_KERNEL(partial_send, #endif int, int64_t, - plat::float16) { + phi::dtype::float16) { } diff --git a/paddle/fluid/operators/collective/partial_send_op.h b/paddle/fluid/operators/collective/partial_send_op.h index b7b72789b87ff..9076ce014fcab 100644 --- a/paddle/fluid/operators/collective/partial_send_op.h +++ b/paddle/fluid/operators/collective/partial_send_op.h @@ -29,7 +29,7 @@ template class PartialSendOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx UNUSED) const override { - PADDLE_THROW(platform::errors::Unavailable( + PADDLE_THROW(phi::errors::Unavailable( "Do not support partial_send for cpu kernel now.")); } }; diff --git a/paddle/fluid/operators/collective/recv_v2_op.cc b/paddle/fluid/operators/collective/recv_v2_op.cc index 40757ca89daa8..1448aad5f9bfa 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cc @@ -30,12 +30,12 @@ class RecvOpV2 : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE( peer, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The peer (%d) for recv_v2 op must be non-negative.", peer)); PADDLE_ENFORCE_GE( ring_id, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The ring_id (%d) for recv_v2 op must be non-negative.", ring_id)); if (ctx->GetOutputsVarType("Out").front() == @@ -44,7 +44,7 @@ class RecvOpV2 : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE( out_shape.size(), 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of the output shape must be greater than 0 " "but the value given is %d.", out_shape.size())); @@ -55,7 +55,7 @@ class RecvOpV2 : public framework::OperatorWithKernel { for (size_t i = 0; i < out_shape.size(); ++i) { PADDLE_ENFORCE_GE(out_shape[i], 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape attribute for recv_v2 must be set " "explicitly, but the %dth element is %d which " "is less than 1. 
Or dynamic_shape should be " @@ -122,4 +122,4 @@ PD_REGISTER_STRUCT_KERNEL(recv_v2, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc index 37cbf9dffdd3d..be849d7e6c53b 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc @@ -41,7 +41,7 @@ framework::DDim recv_shape_info(const platform::Place &place, PADDLE_ENFORCE_EQ( ((stream != nullptr && comm != nullptr) || comm_ctx != nullptr), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "NCCLComm and Stream should be provided if use NCCL " "to send the shape info.")); } @@ -131,14 +131,14 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_GE( rid, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The ring_id (%d) for recv_v2 op must be non-negative.", rid)); int peer = ctx.Attr("peer"); PADDLE_ENFORCE_GE( peer, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The peer (%d) for recv_v2 op must be non-negative.", peer)); gpuStream_t stream = nullptr; @@ -180,7 +180,7 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. " @@ -191,20 +191,20 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { comm_context_manager.Get(std::to_string(rid))); PADDLE_ENFORCE_NE(comm_ctx, nullptr, - platform::errors::Unavailable( + phi::errors::Unavailable( "NCCLCommContext is nullptr, collective op should " "has ring_id attr.")); stream = comm_ctx->GetStream(); VLOG(3) << "new comm_context_manager has rid " << rid; } else { comm = platform::NCCLCommContext::Instance().Get(rid, place); - PADDLE_ENFORCE_LT(peer, - comm->nranks(), - platform::errors::InvalidArgument( - "The value of peer (%d) you set must " - "be less than comm->nranks (%d).", - peer, - comm->nranks())); + PADDLE_ENFORCE_LT( + peer, + comm->nranks(), + phi::errors::InvalidArgument("The value of peer (%d) you set must " + "be less than comm->nranks (%d).", + peer, + comm->nranks())); stream = comm->stream(); VLOG(3) << "old NCCLCommContext has rid " << rid; } @@ -223,8 +223,8 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( dynamic_shape, false, - platform::errors::InvalidArgument("Dynamic shape for send/recv not " - "support LoDTensorArray for now.")); + phi::errors::InvalidArgument("Dynamic shape for send/recv not " + "support LoDTensorArray for now.")); auto out_array = out_var->GetMutable(); for (size_t idx = 0; idx < out_array->size(); ++idx) { VLOG(3) << "LodTensorArray: idx(" << idx << ")"; @@ -267,20 +267,20 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { comm_ctx->Recv(out, numel, peer, stream); } else { comm = platform::NCCLCommContext::Instance().Get(rid, place); - PADDLE_ENFORCE_LT(peer, - comm->nranks(), - platform::errors::InvalidArgument( - "The value of peer (%d) you set must " - "be less than comm->nranks (%d).", - peer, - comm->nranks())); + PADDLE_ENFORCE_LT( + peer, + comm->nranks(), + phi::errors::InvalidArgument("The value of peer (%d) you set must " + "be less than comm->nranks (%d).", + peer, + comm->nranks())); 
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( out->data(), numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " recv " << common::product(out->dims()) << " from " << peer; } #else - PADDLE_THROW(platform::errors::Unavailable( + PADDLE_THROW(phi::errors::Unavailable( "PaddlePaddle should be compiled with NCCL and " "NCCL version >= 2.7.3 is needed.")); #endif @@ -305,5 +305,5 @@ PD_REGISTER_STRUCT_KERNEL(recv_v2, int, int64_t, int8_t, - plat::float16) { + phi::dtype::float16) { } diff --git a/paddle/fluid/operators/collective/recv_v2_op.h b/paddle/fluid/operators/collective/recv_v2_op.h index e76e4a7b55197..47b1941a73442 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.h +++ b/paddle/fluid/operators/collective/recv_v2_op.h @@ -28,8 +28,8 @@ template class RecvOpV2CPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx UNUSED) const override { - PADDLE_THROW(platform::errors::Unavailable( - "Do not support recv for cpu kernel now.")); + PADDLE_THROW( + phi::errors::Unavailable("Do not support recv for cpu kernel now.")); } }; diff --git a/paddle/fluid/operators/collective/send_v2_op.cc b/paddle/fluid/operators/collective/send_v2_op.cc index 862a6a67813c1..c1763a5cd6478 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cc @@ -28,12 +28,12 @@ class SendOpV2 : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE( peer, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The peer (%d) for send_v2 op must be non-negative.", peer)); PADDLE_ENFORCE_GE( ring_id, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The ring_id (%d) for send_v2 op must be non-negative.", ring_id)); } @@ -94,4 +94,4 @@ PD_REGISTER_STRUCT_KERNEL(send_v2, double, int, int64_t, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc index 8c72a7ccd384c..6938f413b0548 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc @@ -41,7 +41,7 @@ void send_shape_info(const phi::DenseTensor& x, PADDLE_ENFORCE_EQ( ((stream != nullptr && comm != nullptr) || comm_ctx != nullptr), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "NCCLComm and Stream should be provided if use NCCL " "to send the shape info.")); } @@ -129,14 +129,14 @@ class SendOpV2CUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_GE( rid, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The ring_id (%d) for send_v2 op must be non-negative.", rid)); int peer = ctx.Attr("peer"); PADDLE_ENFORCE_GE( peer, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The peer (%d) for send_v2 op must be non-negative.", peer)); auto map = distributed::ProcessGroupMapFromGid::getInstance(); if (map->has(rid)) { @@ -171,7 +171,7 @@ class SendOpV2CUDAKernel : public framework::OpKernel { if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. 
" @@ -182,20 +182,20 @@ class SendOpV2CUDAKernel : public framework::OpKernel { comm_context_manager.Get(std::to_string(rid))); PADDLE_ENFORCE_NE(comm_ctx, nullptr, - platform::errors::Unavailable( + phi::errors::Unavailable( "NCCLCommContext is nullptr, collective op should " "has ring_id attr.")); stream = comm_ctx->GetStream(); VLOG(3) << "new comm_context_manager has rid " << rid; } else { comm = platform::NCCLCommContext::Instance().Get(rid, place); - PADDLE_ENFORCE_LT(peer, - comm->nranks(), - platform::errors::InvalidArgument( - "The value of peer (%d) you set must " - "be less than comm->nranks (%d).", - peer, - comm->nranks())); + PADDLE_ENFORCE_LT( + peer, + comm->nranks(), + phi::errors::InvalidArgument("The value of peer (%d) you set must " + "be less than comm->nranks (%d).", + peer, + comm->nranks())); stream = comm->stream(); VLOG(3) << "old NCCLCommContext has rid " << rid; } @@ -210,8 +210,8 @@ class SendOpV2CUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( dynamic_shape, false, - platform::errors::InvalidArgument("Dynamic shape for send/recv not " - "support LoDTensorArray for now.")); + phi::errors::InvalidArgument("Dynamic shape for send/recv not " + "support LoDTensorArray for now.")); auto& x_array = x_var->Get(); for (size_t idx = 0; idx < x_array.size(); idx++) { VLOG(3) << "LodTensorArray: idx(" << idx << ")"; @@ -255,9 +255,9 @@ class SendOpV2CUDAKernel : public framework::OpKernel { << common::product(x->dims()) << " to " << peer; } #else - PADDLE_THROW(platform::errors::Unavailable( - "PaddlePaddle should be compiled with NCCL " - "and NCCL version >= 2.7.3 is needed.")); + PADDLE_THROW( + phi::errors::Unavailable("PaddlePaddle should be compiled with NCCL " + "and NCCL version >= 2.7.3 is needed.")); #endif } }; @@ -280,5 +280,5 @@ PD_REGISTER_STRUCT_KERNEL(send_v2, int, int64_t, int8_t, - plat::float16) { + phi::dtype::float16) { } diff --git a/paddle/fluid/operators/collective/send_v2_op.h b/paddle/fluid/operators/collective/send_v2_op.h index 7f51861008942..196e2941e9315 100644 --- a/paddle/fluid/operators/collective/send_v2_op.h +++ b/paddle/fluid/operators/collective/send_v2_op.h @@ -29,8 +29,8 @@ template class SendOpV2CPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx UNUSED) const override { - PADDLE_THROW(platform::errors::Unavailable( - "Do not support send for cpu kernel now.")); + PADDLE_THROW( + phi::errors::Unavailable("Do not support send for cpu kernel now.")); } }; diff --git a/paddle/fluid/operators/common_infer_shape_functions.cc b/paddle/fluid/operators/common_infer_shape_functions.cc index 1c13f873818f4..0c83eeb6da92e 100644 --- a/paddle/fluid/operators/common_infer_shape_functions.cc +++ b/paddle/fluid/operators/common_infer_shape_functions.cc @@ -37,13 +37,13 @@ inline void GetBroadcastDimsArrays(const framework::DDim &x_dims, PADDLE_ENFORCE_GE( axis, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); PADDLE_ENFORCE_LE( axis, max_dim, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be less than or equal to %d, but received axis is %d.", max_dim, axis)); @@ -68,7 +68,7 @@ inline void GetBroadcastDimsArrays(const framework::DDim &x_dims, x_dims_array[i] == y_dims_array[i] || x_dims_array[i] <= 1 || y_dims_array[i] <= 1, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Broadcast dimension mismatch. 
Operands could " "not be broadcast together with the shape of X = [%s] and " "the shape of Y = [%s]. Received [%d] in X is not equal to " @@ -126,7 +126,7 @@ void UnaryOpUnchangedInferShapeCheckAxis(framework::InferShapeContext *ctx) { PADDLE_ENFORCE_GE( axis, -x_rank, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Attr(axis) value should be in range [-R, R-1], " "R is the rank of Input(X). But received axis: %d, R: %d.", axis, @@ -134,7 +134,7 @@ void UnaryOpUnchangedInferShapeCheckAxis(framework::InferShapeContext *ctx) { PADDLE_ENFORCE_LT( axis, x_rank, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Attr(axis) value should be in range [-R, R-1], " "R is the rank of Input(X). But received axis: %d, R: %d.", axis, @@ -153,7 +153,7 @@ void BinaryOpBroadcastInferShape(framework::InferShapeContext *ctx) { PADDLE_ENFORCE_EQ( ctx->GetInputsVarType(y_name).front(), framework::proto::VarType::LOD_TENSOR, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The var type of input %s should be phi::DenseTensor, but got %s.", ctx->Inputs(y_name).front(), ctx->GetInputsVarType(y_name).front())); @@ -162,7 +162,7 @@ void BinaryOpBroadcastInferShape(framework::InferShapeContext *ctx) { framework::proto::VarType::SELECTED_ROWS) { PADDLE_ENFORCE_EQ(y_dims.size(), 1u, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "For binary broadcastable operator, if X is " "Sparse(VarType.SELECTED_ROWS" "), Y must be scalar, and the size of Y should be 1. " @@ -171,7 +171,7 @@ void BinaryOpBroadcastInferShape(framework::InferShapeContext *ctx) { PADDLE_ENFORCE_EQ( y_dims[0], 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "For binary broadcastable operator, if X is " "Sparse(VarType.SELECTED_ROWS" "), Y must be scalar, the first dimension of Y should be 1. " @@ -179,7 +179,7 @@ void BinaryOpBroadcastInferShape(framework::InferShapeContext *ctx) { y_dims[0])); } else if (ctx->GetInputsVarType(x_name).front() != framework::proto::VarType::LOD_TENSOR) { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "For binary broadcastable operator, the var type of input X should " "be LOD_TENSOR, but got %s", ctx->GetInputsVarType(x_name).front())); diff --git a/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc b/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc index e684efe12c598..9f3034179fdd7 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc @@ -15,7 +15,7 @@ limitations under the License. 
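For reference, the rule GetBroadcastDimsArrays enforces above: axis must lie in [0, max_dim], and once the shorter shape is aligned at that offset, every dimension pair must either match or have a side that is 1 (or the -1 runtime placeholder). A standalone illustration of the per-dimension test, assuming the two shapes have already been padded to equal rank; this helper is not the Paddle function itself:

    bool DimsBroadcastable(const std::vector<int64_t>& x,
                           const std::vector<int64_t>& y) {
      for (size_t i = 0; i < x.size(); ++i) {
        // Mirrors the hunk's condition:
        // x_dims_array[i] == y_dims_array[i] || x_dims_array[i] <= 1 ||
        // y_dims_array[i] <= 1
        if (!(x[i] == y[i] || x[i] <= 1 || y[i] <= 1)) return false;
      }
      return true;
    }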
*/ #include "paddle/fluid/operators/controlflow/conditional_block_op.h" #ifdef PADDLE_WITH_DNNL -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/onednn_helper.h" #endif #include "paddle/common/flags.h" @@ -72,7 +72,7 @@ class ConditionalBlockInferOp : public ConditionalOp { auto *scope_var = scope.FindVar(Output("Scope")); PADDLE_ENFORCE_NOT_NULL( scope_var, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Scope must be set in ConditionalBlockInferOp.")); auto *scopes = scope_var->GetMutable>(); scopes->resize(1); diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc index 981bf0f8b00f5..3b320dd3f7912 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/operators/controlflow/control_flow_op_helper.h" #ifdef PADDLE_WITH_DNNL -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/onednn_helper.h" #endif COMMON_DECLARE_bool(use_mkldnn); @@ -73,7 +73,7 @@ class ConditionalBlockOp : public ConditionalOp { auto *scope_var = scope.FindVar(Output(ConditionalOp::kScope)); PADDLE_ENFORCE_NOT_NULL( scope_var, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Expect Scope variable to be set in conditional_block_op, but " "got a null Scope variable. Please set the Scope variable.")); @@ -139,7 +139,7 @@ class ConditionalBlockInferShape : public framework::InferShapeBase { void operator()(framework::InferShapeContext *context) const override { PADDLE_ENFORCE_EQ(context->HasInputs(ConditionalOp::kCondition), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "conditional_block_op must have condition input.")); } }; @@ -180,14 +180,14 @@ class ConditionalBlockGradOp : public ConditionalOp { auto *scope_var = scope.FindVar(Input(ConditionalOp::kScope)); PADDLE_ENFORCE_NOT_NULL( scope_var, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Expect Scope variable to be set in conditional_block_op, but " "got a null Scope variable. 
Please set the Scope variable.")); auto &scopes = scope_var->Get>(); PADDLE_ENFORCE_GT( scopes.size(), 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Expect Scope variable contains at least 1 scope, but got: %d", scopes.size())); framework::Scope &cur_scope = *(scopes[0]); @@ -272,7 +272,7 @@ class ConditionalBlockGradInferShape : public framework::InferShapeBase { PADDLE_ENFORCE_EQ( context->HasInputs(ConditionalOp::kCondition), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Condition must be set in conditional_block_grad_op.")); if (context->HasInputs(ConditionalOp::kInputs) && context->HasOutputs(framework::GradVarName(ConditionalOp::kInputs))) { @@ -294,7 +294,7 @@ class ConditionalBlockGradInferVarType : public framework::VarTypeInference { ctx->OutputSize(framework::GradVarName(ConditionalOp::kInputs)); PADDLE_ENFORCE_EQ(input_size, output_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "input_size and output_size should be equal for " "conditional_block_grad_op.")); for (size_t i = 0; i < output_size; ++i) { diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h index 0f04a295ed263..7b24ec5629a48 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op.h @@ -53,7 +53,7 @@ class ConditionalOp : public framework::OperatorBase { [&scope](const std::string &var_name) -> const phi::DenseTensor * { auto *var = scope.FindVar(var_name); PADDLE_ENFORCE_NOT_NULL(var, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Cannot find variable %s", var_name)); return &var->Get(); }); @@ -64,14 +64,14 @@ class ConditionalOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ( ips.size() == 1UL && ips[0]->IsInitialized(), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "condition should have one initialized input as condition")); PADDLE_ENFORCE_EQ(framework::TransToProtoVarType(ips[0]->dtype()) == framework::proto::VarType::BOOL && ips[0]->numel() == 1, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "condition input's data type should be bool, " "numel should be 1, actual numel is %d", ips[0]->numel())); diff --git a/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc b/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc index 08569d835fd82..2908d1f5a5f81 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc @@ -38,7 +38,7 @@ static void FindAllConditionalBlockAndConditionalBlockGradOp( PADDLE_ENFORCE_GE( fwd_ops->size(), bwd_ops->size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Size of forward ops must be greater or equal to backward ops. The " "number of forward ops is %d and the number of backward ops is %d", fwd_ops->size(), @@ -59,7 +59,7 @@ static void FindAllConditionalBlockAndConditionalBlockGradOp( PADDLE_ENFORCE_GE( fwd_ops->size(), bwd_ops->size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "There are more conditional_block_grad ops than " "conditional_block ops in the graph or program. 
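ConditionalOp's input validation above boils down to: the condition must be exactly one initialized tensor of dtype bool holding a single element. The two enforce calls, condensed:

    PADDLE_ENFORCE_EQ(
        ips.size() == 1UL && ips[0]->IsInitialized(),
        true,
        phi::errors::InvalidArgument(
            "condition should have one initialized input as condition"));
    PADDLE_ENFORCE_EQ(
        framework::TransToProtoVarType(ips[0]->dtype()) ==
                framework::proto::VarType::BOOL &&
            ips[0]->numel() == 1,
        true,
        phi::errors::InvalidArgument(
            "condition input's data type should be bool, "
            "numel should be 1, actual numel is %d",
            ips[0]->numel()));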
The number of " "forward ops is %d and the number of backward ops is %d", @@ -122,7 +122,7 @@ static void PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOpImpl( bwd_op)) { PADDLE_ENFORCE_EQ(matched_fwd_op, nullptr, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Found multiple matched conditional_block ops.")); matched_fwd_op = &fwd_op; } @@ -130,7 +130,7 @@ static void PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOpImpl( PADDLE_ENFORCE_NOT_NULL( matched_fwd_op, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Cannot find matched forward conditional_block op.")); SetSkipVarsForConditionalBlockOp(const_cast(matched_fwd_op), diff --git a/paddle/fluid/operators/controlflow/control_flow_op_helper.h b/paddle/fluid/operators/controlflow/control_flow_op_helper.h index 0d08ae6d68663..945bcbb4e905e 100644 --- a/paddle/fluid/operators/controlflow/control_flow_op_helper.h +++ b/paddle/fluid/operators/controlflow/control_flow_op_helper.h @@ -96,7 +96,7 @@ static void AssignZeroToParentScope( PADDLE_ENFORCE_EQ( outside_var->IsType(), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Type of outside_var %s is NOT phi::DenseTensor, which " "doesn't match input_var %s.", outside_grad_name, @@ -108,7 +108,7 @@ static void AssignZeroToParentScope( } else if (input_var->IsType()) { PADDLE_ENFORCE_EQ(outside_var->IsType(), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Type of outside_var %s is NOT LoDTensorArray, " "which doesn't match input_var %s.", outside_grad_name, @@ -121,7 +121,7 @@ static void AssignZeroToParentScope( } PADDLE_ENFORCE_EQ(input_tensors.size(), outside_tensors->size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "LoDTensorArray outside_var %s doen't have same " "size as input_var %s.", outside_grad_name, @@ -132,7 +132,7 @@ static void AssignZeroToParentScope( } } else { // TODO(huihuangzheng): add support for SelectedRows - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Conditional block grad op doesn't support non-phi::DenseTensor " "output " "now.")); diff --git a/paddle/fluid/operators/controlflow/depend_op.cc b/paddle/fluid/operators/controlflow/depend_op.cc index 925990ba3ba5f..58ed498ad1b9e 100644 --- a/paddle/fluid/operators/controlflow/depend_op.cc +++ b/paddle/fluid/operators/controlflow/depend_op.cc @@ -50,8 +50,8 @@ class DependOp : public framework::OperatorBase { auto out_name = Output("Out"); PADDLE_ENFORCE_EQ(x_name, out_name, - platform::errors::PreconditionNotMet( - "Input(X) and Output(Out) varibale should be the " + phi::errors::PreconditionNotMet( + "Input(X) and Output(Out) variable should be the " "same, but got Input is %s and Output is %s.", x_name, out_name)); diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index 7d0d899e8b6c3..141b13a71164b 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -34,7 +34,7 @@ const framework::FeedType& CheckAndGetFeedItem(const phi::ExtendedTensor& x, int col) { PADDLE_ENFORCE_GE(col, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Expected the column index (the attribute 'col' of " "operator 'Feed') of current feeding variable to be " "no less than 0. 
But received column index = %d.", @@ -43,7 +43,7 @@ const framework::FeedType& CheckAndGetFeedItem(const phi::ExtendedTensor& x, PADDLE_ENFORCE_LT( static_cast(col), feed_list->size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The column index of current feeding variable is expected to be " "less than the length of feeding list. But received column index = " "%d, the length of feeding list = %d", @@ -60,7 +60,7 @@ void FeedDenseTensorKernel(const Context& dev_ctx, phi::DenseTensor* out) { PADDLE_ENFORCE_NOT_NULL( out, - platform::errors::NotFound( + phi::errors::NotFound( "Output cannot be found in scope for operator 'Feed'")); const auto& feed_item = CheckAndGetFeedItem(x, col); const auto& in_tensor = paddle::get(feed_item); @@ -81,7 +81,7 @@ void FeedSparseCooTensorKernel(const Context& dev_ctx, phi::SparseCooTensor* out) { PADDLE_ENFORCE_NOT_NULL( out, - platform::errors::NotFound( + phi::errors::NotFound( "Output cannot be found in scope for operator 'Feed'")); const auto& feed_item = CheckAndGetFeedItem(x, col); const auto& in_tensor = paddle::get(feed_item); @@ -103,7 +103,7 @@ void FeedStringsKernel(const Context& dev_ctx UNUSED, phi::ExtendedTensor* out) { PADDLE_ENFORCE_NOT_NULL( out, - platform::errors::NotFound( + phi::errors::NotFound( "Output cannot be found in scope for operator 'Feed'")); const auto& feed_item = CheckAndGetFeedItem(x, col); auto strs_out = static_cast(out); diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc index d3b4b086470a0..c9ceb1f3e01b2 100644 --- a/paddle/fluid/operators/controlflow/fetch_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -76,7 +76,7 @@ class FetchOp : public framework::OperatorBase { auto *fetch_var = scope.FindVar(fetch_var_name); PADDLE_ENFORCE_NOT_NULL( fetch_var, - platform::errors::NotFound( + phi::errors::NotFound( "Input variable(%s) cannot be found in scope for operator 'Fetch'." "Confirm that you have used the fetch `Variable` format " "instead of the string literal('%s') in `fetch_list` " @@ -91,15 +91,15 @@ class FetchOp : public framework::OperatorBase { auto *out_var = scope.FindVar(out_name); PADDLE_ENFORCE_NOT_NULL( out_var, - platform::errors::NotFound("Output variable(%s) cannot be found " - "in scope for operator 'Fetch'.", - out_name)); + phi::errors::NotFound("Output variable(%s) cannot be found " + "in scope for operator 'Fetch'.", + out_name)); int col = Attr("col"); PADDLE_ENFORCE_GE( col, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Expected the column index (the attribute 'col' of " "operator 'Fetch') of current fetching variable to be " "no less than 0. But received column index = %d.", diff --git a/paddle/fluid/operators/controlflow/fetch_v2_op.cc b/paddle/fluid/operators/controlflow/fetch_v2_op.cc index 8e811c20b28ff..591d3bed324d3 100644 --- a/paddle/fluid/operators/controlflow/fetch_v2_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_v2_op.cc @@ -128,14 +128,14 @@ class FetchV2Kernel { PADDLE_ENFORCE_EQ( ctx.HasOutput("Out"), true, - platform::errors::NotFound("Output(Out) of fetch_v2_op is not found.")); + phi::errors::NotFound("Output(Out) of fetch_v2_op is not found.")); auto *out_var = ctx.OutputVar("Out"); int col = ctx.Attr("col"); PADDLE_ENFORCE_GE( col, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Expected the column index (the attribute 'col' of " "operator 'Fetch') of current fetching variable to be " "no less than 0. 
But received column index = %d.", @@ -163,8 +163,8 @@ class FetchV2Kernel { PADDLE_ENFORCE_EQ( check_place, true, - platform::errors::InvalidArgument("Tensor's place of input(X) must " - "be CPUPlace or CUDAPinnedPlace.")); + phi::errors::InvalidArgument("Tensor's place of input(X) must " + "be CPUPlace or CUDAPinnedPlace.")); if (deepcopy) { DeepCopy(src_item, fetch_var_name, dst_item); } else { @@ -186,7 +186,7 @@ class FetchV2Kernel { for (size_t i = 0; i < src_item.size(); ++i) { PADDLE_ENFORCE_EQ(platform::is_cpu_place(src_item[i].place()), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Tensor's place of input(X) must be CPUPlace.")); if (deepcopy) { DeepCopy(src_item[i], fetch_var_name, &dst_item[i]); @@ -244,7 +244,7 @@ PD_REGISTER_STRUCT_KERNEL(fetch_v2, int64_t, uint8_t, bool, - plat::float16, + phi::dtype::float16, plat::bfloat16, plat::complex, plat::complex) {} diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc deleted file mode 100644 index 9262ca59af970..0000000000000 --- a/paddle/fluid/operators/controlflow/get_places_op.cc +++ /dev/null @@ -1,137 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class OpDesc; -class Scope; -template -class EmptyGradOpMaker; -} // namespace framework -namespace imperative { -class OpBase; -} // namespace imperative -} // namespace paddle -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#endif - -namespace paddle { -namespace operators { - -static size_t CUDADevCount() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - return platform::GetGPUDeviceCount(); -#else - return 0UL; -#endif -} - -class GetPlacesOp : public framework::OperatorBase { - public: - GetPlacesOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - bool is_gpu = false; - if (Attr("device_type") == "AUTO") { - is_gpu = platform::is_gpu_place(place); - } else { - is_gpu = Attr("device_type") == "CUDA"; - } - auto device_count = static_cast(Attr("device_count")); - if (device_count == 0) { - device_count = - is_gpu ? CUDADevCount() : std::thread::hardware_concurrency(); - } - PADDLE_ENFORCE_NE( - device_count, - 0UL, - platform::errors::InvalidArgument("Cannot indicate %s device count", - is_gpu ? 
"GPU" : "CPU")); - - auto out_var_name = Output("Out"); - auto &places = - *(GET_DATA_SAFELY( - scope.FindVar(out_var_name), "Output", "Out", "GetPlaces") - .GetMutable()); - places.reserve(device_count); - if (is_gpu) { - PADDLE_ENFORCE_LE(device_count, - CUDADevCount(), - platform::errors::InvalidArgument( - "Only %d CUDA devices found, cannot set to %d", - CUDADevCount(), - device_count)); - for (size_t i = 0; i < device_count; ++i) { - places.emplace_back(platform::CUDAPlace(static_cast(i))); - } - } else { - for (size_t i = 0; i < device_count; ++i) { - places.emplace_back(platform::CPUPlace()); - } - } - } -}; - -class GetPlacesOpProtoMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddOutput("Out", "vector of Place"); - AddAttr("device_count", "device count").SetDefault(0); - AddAttr("device_type", "device type") - .InEnum({"CUDA", "CPU", "AUTO"}) - .SetDefault("AUTO"); - AddComment(R"DOC( -Returns a list of places based on arguments. The list will be used for parallel -execution. -)DOC"); - } -}; - -class GetPlacesInferVarType : public framework::VarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - ctx->SetOutputType( - "Out", framework::proto::VarType::PLACE_LIST, framework::ALL_ELEMENTS); - } -}; - -class GetPlacesInferShape : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *context) const override { - // Do nothing - } -}; - -} // namespace operators -} // namespace paddle -namespace ops = paddle::operators; - -REGISTER_OPERATOR( - get_places, - ops::GetPlacesOp, - ops::GetPlacesOpProtoMaker, - ops::GetPlacesInferVarType, - ops::GetPlacesInferShape, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/controlflow/logical_op_xpu.h b/paddle/fluid/operators/controlflow/logical_op_xpu.h index 614db61558f79..8fde735d99936 100644 --- a/paddle/fluid/operators/controlflow/logical_op_xpu.h +++ b/paddle/fluid/operators/controlflow/logical_op_xpu.h @@ -82,7 +82,7 @@ class BinaryLogicalOpXPUKernel : public framework::OpKernel { bcast_ydims); PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( + phi::errors::External( "XPU broadcast kernel return wrong value[%d %s]", ret, XPUAPIErrorMsg[ret])); @@ -118,7 +118,7 @@ class BinaryLogicalOpXPUKernel : public framework::OpKernel { bcast_ydims); PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( + phi::errors::External( "XPU broadcast kernel return wrong value[%d %s]", ret, XPUAPIErrorMsg[ret])); @@ -144,11 +144,11 @@ class BinaryLogicalOpXPUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( ret, XPU_SUCCESS, - platform::errors::External("XPU API return wrong value[%d %s] in " - "op_name[%s].", - ret, - XPUAPIErrorMsg[ret], - XpuLogicalType2Str(xpu_type))); + phi::errors::External("XPU API return wrong value[%d %s] in " + "op_name[%s].", + ret, + XPUAPIErrorMsg[ret], + XpuLogicalType2Str(xpu_type))); if (need_broad_cast && dev_ctx.x_context()->xpu_stream != nullptr) { dev_ctx.Wait(); @@ -178,7 +178,7 @@ class UnaryLogicalOpXPUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( ret, XPU_SUCCESS, - platform::errors::External( + phi::errors::External( "XPU API return wrong value[%d %s].", ret, XPUAPIErrorMsg[ret])); } }; diff --git a/paddle/fluid/operators/controlflow/op_variant.cc b/paddle/fluid/operators/controlflow/op_variant.cc index 8d43a21e66437..0976ff36e63b2 100644 --- 
a/paddle/fluid/operators/controlflow/op_variant.cc +++ b/paddle/fluid/operators/controlflow/op_variant.cc @@ -70,11 +70,10 @@ void AppendOpVariantByOpName(const std::vector &op_descs, std::vector *result_ops) { PADDLE_ENFORCE_NOT_NULL( result_ops, - platform::errors::Unavailable("result_ops should not be a null_ptr.")); + phi::errors::Unavailable("result_ops should not be a null_ptr.")); for (auto *op_desc : op_descs) { PADDLE_ENFORCE_NOT_NULL( - op_desc, - platform::errors::Unavailable("op_desc should not be a null_ptr.")); + op_desc, phi::errors::Unavailable("op_desc should not be a null_ptr.")); if (op_desc->Type() == candidate_op_name) { result_ops->emplace_back(op_desc); } @@ -87,11 +86,10 @@ void AppendOpVariantByOpName( std::unordered_set *result_ops) { PADDLE_ENFORCE_NOT_NULL( result_ops, - platform::errors::Unavailable("result_ops should not be a null_ptr.")); + phi::errors::Unavailable("result_ops should not be a null_ptr.")); for (auto *op_desc : op_descs) { PADDLE_ENFORCE_NOT_NULL( - op_desc, - platform::errors::Unavailable("op_desc should not be a null_ptr.")); + op_desc, phi::errors::Unavailable("op_desc should not be a null_ptr.")); if (op_desc->Type() == candidate_op_name) { result_ops->emplace(op_desc); } diff --git a/paddle/fluid/operators/controlflow/op_variant.h b/paddle/fluid/operators/controlflow/op_variant.h index ad7cc6b741eb9..ed13a0285c375 100644 --- a/paddle/fluid/operators/controlflow/op_variant.h +++ b/paddle/fluid/operators/controlflow/op_variant.h @@ -49,10 +49,9 @@ class OpVariant { const AttrType &Attr(const std::string &name) const { auto &attrs = Attrs(); auto it = attrs.find(name); - PADDLE_ENFORCE_NE( - it, - attrs.end(), - platform::errors::NotFound("Cannot find attribute %s.", name)); + PADDLE_ENFORCE_NE(it, + attrs.end(), + phi::errors::NotFound("Cannot find attribute %s.", name)); return PADDLE_GET_CONST(AttrType, it->second); } diff --git a/paddle/fluid/operators/controlflow/pylayer_op.cc b/paddle/fluid/operators/controlflow/pylayer_op.cc index bd83c99a0c62d..57bce4224770a 100644 --- a/paddle/fluid/operators/controlflow/pylayer_op.cc +++ b/paddle/fluid/operators/controlflow/pylayer_op.cc @@ -95,7 +95,7 @@ class PyLayerForwardOp : public PyLayerOp { auto *scope_var = scope.FindVar(Output(kScope)); PADDLE_ENFORCE_NOT_NULL( scope_var, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Expect Scope variable to be set in pylayer_op, but " "got a null Scope variable. 
Please set the Scope variable.")); @@ -109,7 +109,7 @@ class PyLayerForwardOp : public PyLayerOp { PADDLE_ENFORCE_GT( blocks.size(), 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Expect blocks contains at least 1 block, but got: %d", blocks.size())); @@ -123,7 +123,7 @@ class PyLayerForwardOp : public PyLayerOp { LOG_FIRST_N(INFO, 1) << "[ControlFlow][PyLayer] New Executor is Running."; CreateInterpreter(dev_place, *forward_block, &cur_scope, skip_vars); - PADDLE_ENFORCE_NOT_NULL(core_, platform::errors::Fatal("core_ is nullptr")); + PADDLE_ENFORCE_NOT_NULL(core_, phi::errors::Fatal("core_ is nullptr")); core_->Run({}, false); } }; @@ -156,7 +156,7 @@ class PyLayerBackwardMaker : public framework::SingleGradOpMaker { PADDLE_ENFORCE_GT( blocks.size(), static_cast(PyLayerBlockIndex::kBACKWARD), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Expect blocks contains at least 2 block, but got: %d", blocks.size())); grad_op->SetBlockAttr( @@ -188,7 +188,7 @@ class PyLayerBackwardOp : public PyLayerOp { PADDLE_ENFORCE_EQ( inside_grads.size(), outside_grads.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Mismatch inside_grads.size(): %d, and outside_grads.size(): %d", inside_grads.size(), outside_grads.size())); @@ -196,14 +196,14 @@ class PyLayerBackwardOp : public PyLayerOp { auto *scope_var = scope.FindVar(Input(PyLayerOp::kScope)); PADDLE_ENFORCE_NOT_NULL( scope_var, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Expect Scope variable to be set in pylayer_op, but " "got a null Scope variable. Please set the Scope variable.")); auto &scopes = scope_var->Get>(); PADDLE_ENFORCE_GT( scopes.size(), 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Expect Scope variable contains at least 1 scope, but got: %d", scopes.size())); framework::Scope &cur_scope = *(scopes[0]); @@ -216,7 +216,7 @@ class PyLayerBackwardOp : public PyLayerOp { << "[ControlFlow][PyLayerBackwardOp] New Executor is Running."; CreateInterpreter(dev_place, *backward_block, &cur_scope, inside_grads); - PADDLE_ENFORCE_NOT_NULL(core_, platform::errors::Fatal("core_ is nullptr")); + PADDLE_ENFORCE_NOT_NULL(core_, phi::errors::Fatal("core_ is nullptr")); core_->Run({}, false); @@ -252,7 +252,7 @@ class PyLayerBackwardInferVarType : public framework::VarTypeInference { ctx->OutputSize(framework::GradVarName(PyLayerOp::kInputs)); PADDLE_ENFORCE_EQ(forward_input_size, backward_output_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "input_size and output_size should be equal for " "pylayer_grad op.")); for (size_t i = 0; i < backward_output_size; ++i) { diff --git a/paddle/fluid/operators/controlflow/pylayer_op_helper.cc b/paddle/fluid/operators/controlflow/pylayer_op_helper.cc index 9dc53d428ef1d..bdd669c644e6e 100644 --- a/paddle/fluid/operators/controlflow/pylayer_op_helper.cc +++ b/paddle/fluid/operators/controlflow/pylayer_op_helper.cc @@ -38,7 +38,7 @@ static void FindAllPyLayerOpAndPyLayerGradOp( PADDLE_ENFORCE_GE( fwd_ops->size(), bwd_ops->size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Size of forward ops must be greater or equal to backward ops. 
The " "number of forward ops is %d and the number of backward ops is %d", fwd_ops->size(), @@ -59,7 +59,7 @@ static void FindAllPyLayerOpAndPyLayerGradOp( PADDLE_ENFORCE_GE( fwd_ops->size(), bwd_ops->size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "There are more pylayer_grad ops than " "pylayer ops in the graph or program. The number of " "forward ops is %d and the number of backward ops is %d", @@ -119,14 +119,14 @@ static void PrepareSafeEagerDeletionOnPyLayerOpAndPyLayerGradOp( if (IsMatchedPyLayerOpAndPyLayerGradOp(fwd_op, bwd_op)) { PADDLE_ENFORCE_EQ(matched_fwd_op, nullptr, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Found multiple matched pylayer ops.")); matched_fwd_op = &fwd_op; } } PADDLE_ENFORCE_NOT_NULL(matched_fwd_op, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Cannot find matched forward pylayer op.")); SetSkipVarsForPyLayerOp(const_cast(matched_fwd_op), &bwd_op); diff --git a/paddle/fluid/operators/controlflow/recurrent_op_helper.cc b/paddle/fluid/operators/controlflow/recurrent_op_helper.cc index 2851757dccc4d..e290fa3e016bd 100644 --- a/paddle/fluid/operators/controlflow/recurrent_op_helper.cc +++ b/paddle/fluid/operators/controlflow/recurrent_op_helper.cc @@ -74,7 +74,7 @@ static void FindAllOpAndGradOp(const framework::ProgramDesc &program, PADDLE_ENFORCE_GE( ops.size(), grad_ops.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "There are more grad ops than forward ops in the graph or program, " "the number of ops is %d and the number of grad_ops is %d.", ops.size(), @@ -95,7 +95,7 @@ static void FindAllOpAndGradOp(const framework::ProgramDesc &program, PADDLE_ENFORCE_GE( ops.size(), grad_ops.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "There are more grad ops than forward ops in the graph or program, " "the number of ops is %d and the number of grad_ops is %d.", ops.size(), @@ -183,7 +183,7 @@ static void SetRecurrentOpAndRecurrentGradOpSkipVarAttr( PADDLE_ENFORCE_EQ( fwd_input.size(), in_grads.size(), - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Backward input gradient number does not match forward " "input number. The number of forward input number is %d and the " "number of backward input gradient number is %d.", @@ -203,7 +203,7 @@ static void SetRecurrentOpAndRecurrentGradOpSkipVarAttr( PADDLE_ENFORCE_EQ( fwd_param.size(), param_grads.size(), - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Backward parameter gradient number does not match " "forward parameter number. 
The number of forward parameter number is " "%d and the number of backward parameter gradient is %d.", @@ -269,15 +269,15 @@ void PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp( if (IsMatchedRecurrentOpAndRecurrentGradOp(fwd_op, bwd_op)) { PADDLE_ENFORCE_EQ(matched_fwd_op, nullptr, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Found multiple recurrent forward op matches " "recurrent grad op.")); matched_fwd_op = &fwd_op; } } - PADDLE_ENFORCE_NOT_NULL(matched_fwd_op, - platform::errors::PreconditionNotMet( - "Cannot find matched forward op.")); + PADDLE_ENFORCE_NOT_NULL( + matched_fwd_op, + phi::errors::PreconditionNotMet("Cannot find matched forward op.")); SetRecurrentOpAndRecurrentGradOpSkipVarAttr(*matched_fwd_op, bwd_op); recurrent_ops.erase(*matched_fwd_op); } diff --git a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc index c04e897aa6366..52006166c8fc8 100644 --- a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc +++ b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc @@ -94,7 +94,7 @@ class WriteToArrayInferShape : public framework::InferShapeBase { PADDLE_ENFORCE_EQ( context->HasInput("I"), true, - platform::errors::NotFound("Input(I) of WriteToArrayOp is not found.")); + phi::errors::NotFound("Input(I) of WriteToArrayOp is not found.")); // TODO(wangchaochaohu) control flow Op do not support runtime infer shape // Later we add [ontext->GetInputDim("I")) == 1] check when it's supported @@ -103,10 +103,10 @@ class WriteToArrayInferShape : public framework::InferShapeBase { return; } - PADDLE_ENFORCE_EQ(context->HasOutput("Out"), - true, - platform::errors::NotFound( - "Output(Out) of WriteToArrayOp is not found.")); + PADDLE_ENFORCE_EQ( + context->HasOutput("Out"), + true, + phi::errors::NotFound("Output(Out) of WriteToArrayOp is not found.")); context->SetOutputDim("Out", context->GetInputDim("X")); // When compile time, we need to: @@ -148,15 +148,13 @@ class ReadFromArrayOp : public ArrayOp { void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { auto *x = scope.FindVar(Input("X")); - PADDLE_ENFORCE_NOT_NULL(x, - platform::errors::NotFound( - "Input(X) of ReadFromArrayOp is not found.")); + PADDLE_ENFORCE_NOT_NULL( + x, phi::errors::NotFound("Input(X) of ReadFromArrayOp is not found.")); auto &x_array = x->Get(); auto *out = scope.FindVar(Output("Out")); PADDLE_ENFORCE_NOT_NULL( out, - platform::errors::NotFound( - "Output(Out) of ReadFromArrayOp is not found.")); + phi::errors::NotFound("Output(Out) of ReadFromArrayOp is not found.")); size_t offset = GetOffset(scope, place); if (offset < x_array.size()) { auto *out_tensor = out->GetMutable(); diff --git a/paddle/fluid/operators/controlflow/unity_build_rule.cmake b/paddle/fluid/operators/controlflow/unity_build_rule.cmake index 594ae3a36cf1d..4b88de66fd2f9 100644 --- a/paddle/fluid/operators/controlflow/unity_build_rule.cmake +++ b/paddle/fluid/operators/controlflow/unity_build_rule.cmake @@ -6,15 +6,11 @@ # in combination rule, you can remove the source file from the following rules. 
register_unity_group( cc - compare_all_op.cc - compare_op.cc conditional_block_infer_op.cc feed_op.cc fetch_op.cc fetch_v2_op.cc get_places_op.cc - logical_op.cc - bitwise_op.cc tensor_array_read_write_op.cc while_op.cc) register_unity_group(cu logical_op.cu bitwise_op.cu compare_op.cu diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 5c758bbf7ff42..65f9145dbd89d 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -20,7 +20,7 @@ #include "paddle/fluid/operators/controlflow/while_op_helper.h" #ifdef PADDLE_WITH_DNNL -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/onednn_helper.h" #endif #include "paddle/fluid/platform/flags.h" @@ -64,15 +64,15 @@ class WhileOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const override { - PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)), - platform::errors::NotFound( - "Input(Condition) of WhileOp is not found.")); + PADDLE_ENFORCE_NOT_NULL( + scope.FindVar(Input(kCondition)), + phi::errors::NotFound("Input(Condition) of WhileOp is not found.")); auto &cond = scope.FindVar(Input(kCondition))->Get(); PADDLE_ENFORCE_EQ( cond.numel(), 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The numel of Input(Condition) of WhileOp must be 1. But now " "the Condition's numel is ", cond.numel(), @@ -136,7 +136,7 @@ class WhileOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ(step_scopes->size(), 0, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The Output(StepScope) of WhileOp should be empty.")); bool cond_data = GetCondData(cond); @@ -329,7 +329,7 @@ class WhileGradOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ( Attr("is_test"), false, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "WhileGradOp is only callable when is_test is false.")); // get device context from pool platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); @@ -350,7 +350,7 @@ class WhileGradOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ(outside_og_names.size(), inside_og_names.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of original output gradient names " "does not match the number of backward input " "gradient names. 
The number of Backward input " @@ -397,7 +397,7 @@ class WhileGradOp : public framework::OperatorBase { !og_outside.GetMutable()->IsInitialized()) { auto *var_desc = parent_block->FindVarRecursive(outside_og_name); PADDLE_ENFORCE_NOT_NULL(var_desc, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Var `%s` is not found in parent " "block, can't fill constant.", outside_og_name)); @@ -448,7 +448,7 @@ class WhileGradOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ( inside_array[j].numel(), 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The numel of %d-th element of var %s (LoDTensorArray) " "in while block must be 0, but received its numel is %d.", j, @@ -457,7 +457,7 @@ class WhileGradOp : public framework::OperatorBase { } } } else { - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(phi::errors::Unimplemented( "Currently only support phi::DenseTensor and " "phi::DenseTensorArray in " "WhileGradOp.")); @@ -474,7 +474,7 @@ class WhileGradOp : public framework::OperatorBase { auto &p_names = Inputs(kX); PADDLE_ENFORCE_EQ(pg_ig_names.size(), p_names.size(), - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The number of names in Outputs(X@GRAD) does not " "match the number of names in Inputs(X). The " "number of names in Outputs(X@GRAD) is %d and " @@ -493,8 +493,8 @@ class WhileGradOp : public framework::OperatorBase { auto pg_ig_var = cur_scope.FindVar(inside_grad_name); PADDLE_ENFORCE_NOT_NULL( pg_ig_var, - platform::errors::NotFound("Variable %s is not found.", - inside_grad_name)); + phi::errors::NotFound("Variable %s is not found.", + inside_grad_name)); if (pg_ig_var->IsType()) { auto pg_ig_lod_t_arr = pg_ig_var->GetMutable(); @@ -531,13 +531,13 @@ class WhileGradOp : public framework::OperatorBase { auto *var = (*cur_scope_iter)->FindVar(inside_grad_name); PADDLE_ENFORCE_NOT_NULL( var, - platform::errors::NotFound("Variable %s is not found.", - inside_grad_name)); + phi::errors::NotFound("Variable %s is not found.", + inside_grad_name)); PADDLE_ENFORCE_EQ( var->IsType() || var->IsType(), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Currently the type of var only can be LoDTensorArray, " "or phi::DenseTensor, but the received var[%s] is %s.", inside_grad_name, @@ -721,7 +721,7 @@ class WhileGradOpShapeInference : public framework::InferShapeBase { auto out_var_ptrs = ctx->GetOutputVarPtrs(kXGRAD); PADDLE_ENFORCE_EQ(in_var_ptrs.size(), out_var_ptrs.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of Inputs(X) must be the same as " "the size of Outputs(X@GRAD).")); diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc index 80b4abe763123..638f2fbae740a 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.cc +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -86,7 +86,7 @@ static void ModifyWhileOpAndWhileGradOpAttr(const OpVariant &fwd_op, PADDLE_ENFORCE_EQ( fwd_input.size(), in_grads.size(), - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Backward output gradient number does not match forward input number." 
"The number of forward input number is %d and the number of backward " "output gradient number is %d.", @@ -116,7 +116,7 @@ static void FindAllWhileAndWhileGradOp(const framework::ProgramDesc &program, PADDLE_ENFORCE_GE( while_ops->size(), while_grad_ops->size(), - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "There are more while_grad_ops than forward while_ops in the graph " "or program, the number of while_ops is %d and the number of " "while_grad_ops is %d.", @@ -137,7 +137,7 @@ static void FindAllWhileAndWhileGradOp(const framework::ProgramDesc &program, PADDLE_ENFORCE_GE( while_ops->size(), while_grad_ops->size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "There are more while_grad_ops than forward while_ops in the graph " "or program, the number of while_ops is %d and the number of " "while_grad_ops is %d.", @@ -167,14 +167,14 @@ static void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl( if (IsMatchedWhileOpAndWhileGradOp(fwd_op, bwd_op)) { PADDLE_ENFORCE_EQ(matched_fwd_op, nullptr, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Found multiple while forward ops match while " "grad ops.")); matched_fwd_op = &fwd_op; } } PADDLE_ENFORCE_NOT_NULL(matched_fwd_op, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Cannot find matched forward while op.")); ModifyWhileOpAndWhileGradOpAttr(*matched_fwd_op, bwd_op); while_op_set.erase(*matched_fwd_op); @@ -231,7 +231,7 @@ bool GetCondData(const phi::DenseTensor &cond) { defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get()); #else - PADDLE_THROW(platform::errors::PreconditionNotMet( + PADDLE_THROW(phi::errors::PreconditionNotMet( "This version of PaddlePaddle does NOT support GPU/XPU but got " "GPU/XPU tensor Cond in WhileOp. Please compile WITH_GPU or " "WITH_XPU option.")); diff --git a/paddle/fluid/operators/copy_cross_scope_op.cc b/paddle/fluid/operators/copy_cross_scope_op.cc deleted file mode 100644 index ed433518068b4..0000000000000 --- a/paddle/fluid/operators/copy_cross_scope_op.cc +++ /dev/null @@ -1,155 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include - -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/var_type_traits.h" - -namespace paddle { -namespace framework { -class OpDesc; -template -class EmptyGradOpMaker; -} // namespace framework -namespace imperative { -class OpBase; -} // namespace imperative -} // namespace paddle - -namespace paddle { -namespace operators { - -class CopyCrossScopeOp : public framework::OperatorBase { - public: - CopyCrossScopeOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext* ctx) const {} - - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - int num_micro_scopes = static_cast(scope.kids().size()); - int num_micro_batches = Attr("num_micro_batches"); - bool ToM = Attr("to_main_scope"); - PADDLE_ENFORCE_EQ(num_micro_scopes, - num_micro_batches, - platform::errors::InvalidArgument( - "For pipeline, number of micro scopes (%d) should " - "be equal to number of micro batches (%d).", - num_micro_scopes, - num_micro_batches)); - const std::string& id_name = Input("Id"); - auto* id_var = scope.FindVar(id_name); - PADDLE_ENFORCE_NOT_NULL( - id_var, - platform::errors::NotFound("No variable with name %s found.", id_name)); - auto id_tensor = id_var->GetMutable(); - auto it = scope.kids().begin(); - phi::DenseTensor cpu_id_tensor; - paddle::framework::TensorCopySync( - *id_tensor, platform::CPUPlace(), &cpu_id_tensor); - auto id_value = cpu_id_tensor.data(); - for (auto i = 0; i < *id_value; i++) { - it++; - } - if (it == scope.kids().end()) { - if (ToM) { - auto dst_scope = *it; - const std::string& x_name = Input("X"); - auto* dst_var = dst_scope->FindVar(x_name); - PADDLE_ENFORCE_NOT_NULL( - dst_var, - platform::errors::NotFound( - "No variable with name %s found in source scope.", x_name)); - auto* main_var = scope.FindVar(x_name); - PADDLE_ENFORCE_NOT_NULL( - main_var, - platform::errors::NotFound( - "No variable with name %s found in destination scope.", - x_name)); - auto dst_tensor = dst_var->GetMutable(); - auto main_tensor = main_var->GetMutable(); - paddle::framework::TensorCopySync( - *dst_tensor, main_tensor->place(), main_tensor); - } - return; - } - auto source_scope = *it; - it++; - auto dst_scope = *it; - const std::string& x_name = Input("X"); - auto* source_var = source_scope->FindVar(x_name); - PADDLE_ENFORCE_NOT_NULL( - source_var, - platform::errors::NotFound( - "No variable with name %s found in source scope.", x_name)); - auto* dst_var = dst_scope->FindVar(x_name); - PADDLE_ENFORCE_NOT_NULL( - dst_var, - platform::errors::NotFound( - "No variable with name %s found in destination scope.", x_name)); - auto src_tensor = source_var->GetMutable(); - auto dst_tensor = dst_var->GetMutable(); - paddle::framework::TensorCopySync( - *src_tensor, dst_tensor->place(), dst_tensor); - - if (ToM) { - auto* main_var = scope.FindVar(x_name); - PADDLE_ENFORCE_NOT_NULL( - main_var, - platform::errors::NotFound( - "No variable with name %s found in destination scope.", x_name)); - auto main_tensor = main_var->GetMutable(); - paddle::framework::TensorCopySync( - *dst_tensor, main_tensor->place(), main_tensor); - } - } -}; - -class CopyCrossScopeOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void 
Make() override { - AddInput("X", - "(Tensor), The first input tensor of copy_cross_scope op, which " - "is copying micro scope."); - AddInput("Id", - "(Tensor), The second input tensor of copy_cross_scope op, which " - "is a id of the current micro scope."); - AddAttr("to_main_scope", "Return current scope to main scope.") - .SetDefault(false); - AddAttr("num_micro_batches", "Number of micro batches for pipeline."); - AddComment(R"DOC( - This op is used by pipeline to copy tensors across micro batch scopes. - Copy the variable value of the giving Id's micro scope to the micro scope of Id + 1 position. - If need to copy back to the main scope, using to_main_scope option to copy the variable value of - the current micro scope to the main scope. - )DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(copy_cross_scope, - ops::CopyCrossScopeOp, - ops::CopyCrossScopeOpMaker); diff --git a/paddle/fluid/operators/correlation_op.cc b/paddle/fluid/operators/correlation_op.cc index 427f9a0307399..1243ae595bac4 100644 --- a/paddle/fluid/operators/correlation_op.cc +++ b/paddle/fluid/operators/correlation_op.cc @@ -84,19 +84,19 @@ class CorrelationOp : public framework::OperatorWithKernel { auto in_dims = ctx->GetInputDim("Input1"); auto in2_dims = ctx->GetInputDim("Input2"); - PADDLE_ENFORCE_EQ(in_dims.size() == 4, - true, - platform::errors::InvalidArgument( - "Input(X) of CorrelationOp must be 4 dims." - "But received dims is %d.", - in_dims.size())); - - PADDLE_ENFORCE_EQ(in2_dims.size() == 4, - true, - platform::errors::InvalidArgument( - "Input(Y) of CorrelationOp must be 4 dims." - "But received dims is %d.", - in2_dims.size())); + PADDLE_ENFORCE_EQ( + in_dims.size() == 4, + true, + phi::errors::InvalidArgument("Input(X) of CorrelationOp must be 4 dims." + "But received dims is %d.", + in_dims.size())); + + PADDLE_ENFORCE_EQ( + in2_dims.size() == 4, + true, + phi::errors::InvalidArgument("Input(Y) of CorrelationOp must be 4 dims." 
+ "But received dims is %d.", + in2_dims.size())); std::vector output_shape = CorrelationOutputSize(static_cast(in_dims[0]), static_cast(in_dims[2]), @@ -114,11 +114,11 @@ class CorrelationOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Input1"); - PADDLE_ENFORCE_EQ(input_data_type, - framework::TransToProtoVarType( - ctx.Input("Input2")->dtype()), - platform::errors::InvalidArgument( - "X and Y shoule have the same datatype")); + PADDLE_ENFORCE_EQ( + input_data_type, + framework::TransToProtoVarType( + ctx.Input("Input2")->dtype()), + phi::errors::InvalidArgument("X and Y shoule have the same datatype")); return phi::KernelKey(input_data_type, ctx.GetPlace()); } }; @@ -173,7 +173,7 @@ class CorrelationKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::Unimplemented("Correlation only supports GPU now.")); + phi::errors::Unimplemented("Correlation only supports GPU now.")); } }; diff --git a/paddle/fluid/operators/correlation_op.cu b/paddle/fluid/operators/correlation_op.cu index ee6cc22c867c3..61b922e0caecc 100644 --- a/paddle/fluid/operators/correlation_op.cu +++ b/paddle/fluid/operators/correlation_op.cu @@ -179,10 +179,10 @@ template class CorrelationCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), - true, - platform::errors::InvalidArgument( - "Correlation only supports GPU now.")); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), + true, + phi::errors::InvalidArgument("Correlation only supports GPU now.")); auto *input1 = ctx.Input("Input1"); auto *input2 = ctx.Input("Input2"); @@ -447,10 +447,10 @@ template class CorrelationCUDAGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), - true, - platform::errors::InvalidArgument( - "Correlation only supports GPU now.")); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), + true, + phi::errors::InvalidArgument("Correlation only supports GPU now.")); const auto *input1 = ctx.Input("Input1"); const auto *input2 = ctx.Input("Input2"); const auto *grad_output = diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc index e0cbcc513d6cd..62edb0ece83fc 100644 --- a/paddle/fluid/operators/crf_decoding_op.cc +++ b/paddle/fluid/operators/crf_decoding_op.cc @@ -103,7 +103,7 @@ class CRFDecodingOp : public framework::OperatorWithKernel { if (has_length) { PADDLE_ENFORCE_EQ(emission_dims.size(), 3, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(Emission) should be a 3-D tensor. But " "received: input rank %u, input shape [%s]. ", emission_dims.size(), @@ -111,7 +111,7 @@ class CRFDecodingOp : public framework::OperatorWithKernel { } else { PADDLE_ENFORCE_EQ(emission_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(Emission) should be a 2-D tensor. 
But " "received: input rank %u, input shape [%s].", emission_dims.size(), @@ -121,7 +121,7 @@ class CRFDecodingOp : public framework::OperatorWithKernel { auto transition_dims = ctx->GetInputDim("Transition"); PADDLE_ENFORCE_EQ(transition_dims.size(), 2UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(Transition) should be a 2-D tensor. But " "received: input rank %u, input shape [%s].", transition_dims.size(), @@ -129,7 +129,7 @@ class CRFDecodingOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( transition_dims[0] - 2, transition_dims[1], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "An invalid dimension for the Input(Transition), which should " "be a 2-D tensor with shape [(D + 2) x D]. But received: input " "rank %u, " @@ -140,7 +140,7 @@ class CRFDecodingOp : public framework::OperatorWithKernel { transition_dims[transition_dims.size() - 1] > 0)) { PADDLE_ENFORCE_EQ(emission_dims[emission_dims.size() - 1], transition_dims[transition_dims.size() - 1], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The last dimension of the Input(Emission) and the " "Input(Transition) " "should be equal to the tag number. But received " @@ -159,7 +159,7 @@ class CRFDecodingOp : public framework::OperatorWithKernel { (label_dims.size() == 3UL && label_dims[2] == 1) || label_dims.size() == 2UL, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(Label) should be a 3-D tensor with last dimension " "fixed to 1 or a 2-D tensor in padding mode. But received: " "input " @@ -171,7 +171,7 @@ class CRFDecodingOp : public framework::OperatorWithKernel { (label_dims.size() == 2UL && label_dims[1] == 1) || label_dims.size() == 1UL, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(Label) should be a 2-D tensor with last " "dimension fixed to 1 or a 1-D tensor. But received: " "input rank %u, input shape [%s].", @@ -182,7 +182,7 @@ class CRFDecodingOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( emission_dims[0], label_dims[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dimension of Input(Emission) and Input(Label) " "should be the same. But received Input(Emission): rank %u, " "shape [%s]; received Input(Label): rank %u, shape [%s].", diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h index 50d6eece098e3..6649043014d64 100644 --- a/paddle/fluid/operators/crf_decoding_op.h +++ b/paddle/fluid/operators/crf_decoding_op.h @@ -78,7 +78,7 @@ class CRFDecodingOpKernel : public framework::OpKernel { } else { PADDLE_ENFORCE_EQ(emission_weights->NumLevels(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(Emission) should be a sequence with lod " "level 1. But received: lod level %u.", emission_weights->NumLevels())); @@ -86,7 +86,7 @@ class CRFDecodingOpKernel : public framework::OpKernel { PADDLE_ENFORCE_GT( lod.size(), 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(Emission) must be a sequence. But received: lod level %u.", lod.size())); const size_t level = 0; @@ -105,7 +105,7 @@ class CRFDecodingOpKernel : public framework::OpKernel { if (label) { PADDLE_ENFORCE_EQ(label->NumLevels(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(label) should be a sequence with lod " "level 1. 
But received: lod level %u.", label->NumLevels())); diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index 19164959c7ceb..80db8230e9e24 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -34,7 +34,7 @@ class CropOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( int64_t(shape.size()), x_dim.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of elements (%d) of CropOp's " "'shape' attribute should be equal to the number of dimensions " "(%d) of the Input(X).", @@ -49,7 +49,7 @@ class CropOp : public framework::OperatorWithKernel { auto y_dim = ctx->GetInputDim("Y"); PADDLE_ENFORCE_EQ(common::arity(x_dim), common::arity(y_dim), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of dimensions (%d) of CropOp's input(X)" " must be equal to that (%d) of input(Y).", common::arity(x_dim), diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h index 7d0d4f06392fa..04b077de36e50 100644 --- a/paddle/fluid/operators/crop_op.h +++ b/paddle/fluid/operators/crop_op.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/funcs/strided_memcpy.h" namespace paddle { @@ -36,25 +36,25 @@ static std::vector GetOffsets(const framework::ExecutionContext& ctx) { if (ctx.HasInput("Offsets")) { PADDLE_ENFORCE_EQ(ctx.Attr>("offsets").empty(), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input 'Offsets' and attribute 'offsets' " "should not be used at the same time for CropOp.")); const auto* offsets_tensor = ctx.Input("Offsets"); PADDLE_ENFORCE_EQ(offsets_tensor->dims().size(), 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of dimensions of input 'Offsets' for " "CropOp must be 1, but the value received is %d.", offsets_tensor->dims().size())); PADDLE_ENFORCE_EQ( rank, offsets_tensor->dims()[0], - platform::errors::InvalidArgument("The number of elements (%d) for " - "input 'Offsets' must be equal to " - "the number of dimensions (%d) " - "of the input tensor.", - offsets_tensor->dims()[0], - rank)); + phi::errors::InvalidArgument("The number of elements (%d) for " + "input 'Offsets' must be equal to " + "the number of dimensions (%d) " + "of the input tensor.", + offsets_tensor->dims()[0], + rank)); const int* offsets_data; phi::DenseTensor cpu_tmp_tensor; if (platform::is_cpu_place(offsets_tensor->place())) { @@ -70,12 +70,12 @@ static std::vector GetOffsets(const framework::ExecutionContext& ctx) { PADDLE_ENFORCE_EQ( rank, static_cast(res.size()), - platform::errors::InvalidArgument("The number of elements (%d) for " - "input 'Offsets' must be equal to " - "the number of dimensions (%d) " - "of the input tensor.", - res.size(), - rank)); + phi::errors::InvalidArgument("The number of elements (%d) for " + "input 'Offsets' must be equal to " + "the number of dimensions (%d) " + "of the input tensor.", + res.size(), + rank)); } return res; } @@ -101,7 +101,7 @@ void CropFunction(const framework::ExecutionContext& context) { } auto& place = *context.template device_context().eigen_device(); - EigenSlice, T, D>::Eval( + phi::funcs::EigenSlice, T, D>::Eval( place, out_tensor, x_tensor, e_offsets, e_shape); } @@ -113,14 +113,14 @@ class CropKernel 
: public framework::OpKernel { PADDLE_ENFORCE_GE( rank, 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of dimensions of the Input(X) for CropOp must be " "greater than or equal to 1, but the value received is %d.", rank)); PADDLE_ENFORCE_LE( rank, 6, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of dimensions of the Input(X) for CropOp must be " "less than or equal to 6, but the value received is %d.", rank)); @@ -165,7 +165,7 @@ void CropGradFunction(const framework::ExecutionContext& context) { auto d_out_tensor = EigenTensor::From(*d_out); auto& place = *context.template device_context().eigen_device(); - EigenPad, T, D>::Eval( + phi::funcs::EigenPad, T, D>::Eval( place, d_x_tensor, d_out_tensor, paddings, static_cast(0)); } } @@ -181,7 +181,7 @@ class CropGradKernel : public framework::OpKernel { PADDLE_ENFORCE_GE( rank, 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of dimensions of the input 'Out@GRAD' for " "CropGrad must be greater than or equal " "to 1, but the value received is %d.", @@ -189,7 +189,7 @@ class CropGradKernel : public framework::OpKernel { PADDLE_ENFORCE_LE( rank, 6, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of dimensions of the input 'Out@GRAD' for " "CropGrad must be less than or equal " "to 6, but the value received is %d.", diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index cc2b4b4252835..e8baeac3b0bfa 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -42,7 +42,7 @@ class CrossEntropyOpBase : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( common::slice_ddim(x_dims, 0, rank - 1), common::slice_ddim(label_dims, 0, rank - 1), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(X) and Input(Label) shall have the same shape " "except the last dimension. But received: the shape of Input(X) " "is " @@ -55,7 +55,7 @@ class CrossEntropyOpBase : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( rank, label_dims.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "If Attr(soft_label) == true, Input(X) and Input(Label) " "shall have the same dimensions. But received: the dimensions of " "Input(X) is [%d]," @@ -72,7 +72,7 @@ class CrossEntropyOpBase : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( x_dims[rank - 1], label_dims[rank - 1], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "If Attr(soft_label) == true, the last dimension of " "Input(X) and Input(Label) should be equal. But received: the" "last dimension of Input(X) is [%d], the shape of Input(X) is " @@ -91,7 +91,7 @@ class CrossEntropyOpBase : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( label_dims[rank - 1], 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "the last dimension of Input(Label) should be 1." "But received: the last dimension of Input(Label) is [%d]," "the last dimension is [%d]", @@ -101,7 +101,7 @@ class CrossEntropyOpBase : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( rank, label_dims.size() + 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: The rank of Input(X) should be equal to " "Input(Label) plus 1." 
"But received: The dimension of Input(X) is [%d], " @@ -160,7 +160,7 @@ class CrossEntropyGradientOpBase : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( dy_dims.size(), label_dims.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(Y@Grad) and Input(Y) should have the same rank." "But received: Y@Grad's rank is [%d], Y's rank is [%d]", dy_dims.size(), @@ -175,7 +175,7 @@ class CrossEntropyGradientOpBase : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( common::slice_ddim(x_dims, 0, rank - 1), common::slice_ddim(dy_dims, 0, rank - 1), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(X) and Input(Y@Grad) should have the same " "shape except the last dimension. but received: " "the shape of Input(X) is [%s], " diff --git a/paddle/fluid/operators/cross_entropy_op.cu b/paddle/fluid/operators/cross_entropy_op.cu index 06ac7791e6d68..e4e2420d152bc 100644 --- a/paddle/fluid/operators/cross_entropy_op.cu +++ b/paddle/fluid/operators/cross_entropy_op.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cross_entropy_op.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/phi/common/float16.h" namespace plat = paddle::platform; namespace ops = paddle::operators; @@ -24,14 +24,14 @@ PD_REGISTER_STRUCT_KERNEL(cross_entropy, ops::CrossEntropyOpKernel, float, double, - plat::float16) {} + phi::dtype::float16) {} PD_REGISTER_STRUCT_KERNEL(cross_entropy_grad, GPU, ALL_LAYOUT, ops::CrossEntropyGradientOpKernel, float, double, - plat::float16) {} + phi::dtype::float16) {} PD_REGISTER_STRUCT_KERNEL(cross_entropy2, GPU, @@ -39,11 +39,11 @@ PD_REGISTER_STRUCT_KERNEL(cross_entropy2, ops::CrossEntropyOpKernel2, float, double, - plat::float16) {} + phi::dtype::float16) {} PD_REGISTER_STRUCT_KERNEL(cross_entropy_grad2, GPU, ALL_LAYOUT, ops::CrossEntropyGradientOpKernel2, float, double, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h index 5b76cc9a65a2b..9c0d025cb0cbb 100644 --- a/paddle/fluid/operators/cross_entropy_op.h +++ b/paddle/fluid/operators/cross_entropy_op.h @@ -180,7 +180,7 @@ struct HardLabelCrossEntropyForwardFunctor { auto label = label_[idx]; if (label != ignore_index_) { // don't update to PADDLE_ENFORCE_GE and PADDLE_ENFORCE_LT cause - // can't use platform::errors::InvalidArgument in HOSTDEVICE + // can't use phi::errors::InvalidArgument in HOSTDEVICE PADDLE_ENFORCE(label >= 0 && label < feature_size_, "Variable value (label) of " "OP(fluid.layers.cross_entropy) expected >= 0 " diff --git a/paddle/fluid/operators/ctc_align_op.cc b/paddle/fluid/operators/ctc_align_op.cc deleted file mode 100644 index a40ba84610293..0000000000000 --- a/paddle/fluid/operators/ctc_align_op.cc +++ /dev/null @@ -1,133 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/ctc_align_op.h" - -namespace paddle { -namespace operators { - -class CTCAlignOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "ctc_align"); - OP_INOUT_CHECK(ctx->HasOutput("Output"), "Output", "Output", "ctc_align"); - - auto input_dims = ctx->GetInputDim("Input"); - - // TODO(wanghaoshuang): it is tricky to set the wrong dimension here. - ctx->SetOutputDim("Output", input_dims); - if (ctx->HasInput("InputLength")) { - ctx->SetOutputDim("OutputLength", {input_dims[0], 1}); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "Input"), - ctx.device_context().GetPlace()); - } -}; - -class CTCAlignOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Input", - "2-D Tensor or LodTensor with shape " - "[Lp, 1], where Lp is the sum of all input sequences' length."); - AddInput("InputLength", - "2-D Tensor with shape [batch_size, 1], " - " When Input is padding mode, InputLength is length of every " - "sequence in Input.") - .AsDispensable(); - AddOutput("Output", "(Tensor, default: Tensor), The align result."); - AddOutput("OutputLength", - "2-D Tensor with shape [batch_size, 1], " - "When Input is padding mode, OutputLength is length of every " - "sequence in Output.") - .AsDispensable(); - AddAttr("blank", - "(int, default: 0), the blank label set in Connectionist " - "Temporal Classification (CTC) op.") - .SetDefault(0); - AddAttr("merge_repeated", - "(bool, default: true), whether to " - "merge repeated elements between two blanks. ") - .SetDefault(true); - // add attr padding number for tensor input - AddAttr("padding_value", - "(int, default: 0), padding number " - "use to padding tensor. ") - .SetDefault(0); - AddComment(R"DOC( -CTCAlign op is used to merge repeated elements between two blanks -and then delete all blanks in sequence. - -Given: - Input.data = [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, - 6, 0, 0, 7, 7, 7, 0] - Input.dims = {18, 1} - Input.LoD = [[0, 11, 18]] - -And: - blank = 0 - merge_repeated = True - -Then: - Output.data = [1, 2, 4, 4, 5, 6, - 6, 7] - Output.dims = {8, 1} - Output.LoD = [[0, 6, 8]] -or Given: - Input.data = [[0, 1, 2, 2, 0, 4], - [0, 4, 5, 0, 6, 0], - [0, 7, 7, 7, 0, 0]] - InputLength.data = [[6], - [5], - [4]], - Input.dims = {3, 6}, - Input.Lod = [] -And: - blank = 0 - merge_repeated = True - padding_value = 0 - -Then: - Output.data = [[1, 2, 4, 0, 0, 0], - [4, 5, 6, 0, 0, 0], - [7, 0, 0, 0, 0, 0]], - OutputLength.data = [[3], - [3], - [1]], - Output.dims = {3, 6}, - Output.Lod = [] -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - ctc_align, - ops::CTCAlignOp, - ops::CTCAlignOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -PD_REGISTER_STRUCT_KERNEL( - ctc_align, CPU, ALL_LAYOUT, ops::CTCAlignKernel, int, int64_t) {} diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu deleted file mode 100644 index 3b7490b1dcff3..0000000000000 --- a/paddle/fluid/operators/ctc_align_op.cu +++ /dev/null @@ -1,171 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <stdio.h>
-#include <thrust/device_vector.h>
-#include <thrust/host_vector.h>
-
-#include <vector>
-
-#include "paddle/fluid/operators/ctc_align_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-__global__ void MergeAndDelCudaKernel(const int64_t num_token,
-                                      const T* tokens,
-                                      const size_t num_seq,
-                                      size_t* lod0,
-                                      const int blank,
-                                      const int merge_repeated,
-                                      size_t* out_lod0,
-                                      T* output) {
-  int output_idx = 0;
-  out_lod0[0] = 0;
-
-  for (int i = 0; i < num_seq; ++i) {
-    T pre_token = -1;
-    for (int j = lod0[i]; j < lod0[i + 1]; ++j) {
-      if (tokens[j] != blank && !(merge_repeated && tokens[j] == pre_token)) {
-        output[output_idx] = tokens[j];
-        ++output_idx;
-      }
-      pre_token = tokens[j];
-    }
-    out_lod0[i + 1] = output_idx;
-  }
-}
-
-template <typename T>
-__global__ void PaddingMergeAndDelCudaKernel(const int64_t num_token,
-                                             const T* tokens,
-                                             const T* tokens_length,
-                                             const int blank,
-                                             const int merge_repeated,
-                                             const int padding_value,
-                                             const int64_t batch_size,
-                                             T* output,
-                                             T* output_length) {
-  int ind = blockIdx.x * blockDim.x + threadIdx.x;
-  if (ind >= batch_size) return;
-  int output_idx = ind * num_token;
-  T prev_token = -1;
-  for (int i = ind * num_token; i < ind * num_token + tokens_length[ind]; i++) {
-    if ((unsigned)tokens[i] != blank &&
-        !(merge_repeated && tokens[i] == prev_token)) {
-      output[output_idx] = tokens[i];
-      ++output_idx;
-    }
-    prev_token = tokens[i];
-  }
-  output_length[ind] = output_idx - ind * num_token;
-  for (int i = output_idx; i < ind * num_token + num_token; i++) {
-    output[i] = padding_value;
-  }
-}
-
-template <typename T, typename DeviceContext>
-class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()),
-                      true,
-                      platform::errors::InvalidArgument(
-                          "CTCAlign operator CUDA kernel must use CUDAPlace "
-                          "rather than CPUPlace."));
-    auto* input = ctx.Input<phi::DenseTensor>("Input");
-    auto* output = ctx.Output<phi::DenseTensor>("Output");
-    const int blank = ctx.Attr<int>("blank");
-    const int merge_repeated =
-        static_cast<int>(ctx.Attr<bool>("merge_repeated"));
-    const T* tokens = input->data<T>();
-    auto stream = ctx.cuda_device_context().stream();
-
-    // tensor input which has no lod
-    if (input->lod().empty()) {
-      const int padding_value = ctx.Attr<int>("padding_value");
-      auto input_dims = input->dims();
-      T* output_data = output->mutable_data<T>({input_dims[0], input_dims[1]},
-                                               ctx.GetPlace());
-      auto* input_length = ctx.Input<phi::DenseTensor>("InputLength");
-      const T* input_length_data = input_length->data<T>();
-      auto* output_length = ctx.Output<phi::DenseTensor>("OutputLength");
-      T* output_length_data =
-          output_length->mutable_data<T>({input_dims[0], 1}, ctx.GetPlace());
-      PaddingMergeAndDelCudaKernel<T>
-          <<<32, (input_dims[0] + 32 - 1) / 32, 0, stream>>>(
-              input_dims[1],
-              tokens,
-              input_length_data,
-              blank,
-              merge_repeated,
-              padding_value,
-              input_dims[0],
-              output_data,
-              output_length_data);
-    } else {
-      const size_t level = 0;
-      auto input_lod = framework::ToAbsOffset(input->lod());
-
-      const int64_t num_tokens = input->dims()[0];
-      const size_t num_seq = input_lod[level].size() - 1;
-
-      // prepare a lod to record lod information while merging elements
-      thrust::device_vector<size_t> dev_out_lod0(input_lod[level].size());
-      size_t* dev_out_lod0_ptr = thrust::raw_pointer_cast(dev_out_lod0.data());
-
-      // merge elements and delete blank
-      T* output_data = output->mutable_data<T>({num_tokens, 1}, ctx.GetPlace());
-
-      phi::MixVector<size_t> mixv_input_lod(&input_lod[level]);
-      MergeAndDelCudaKernel<T>
-          <<<1, 1, 0, stream>>>(num_tokens,
-                                tokens,
-                                num_seq,
-                                mixv_input_lod.CUDAMutableData(ctx.GetPlace()),
-                                blank,
-                                merge_repeated,
-                                dev_out_lod0_ptr,
-                                output_data);
-      mixv_input_lod.CopyToCPU();
-
-      // set output lod
-      std::vector<size_t> host_out_lod0(dev_out_lod0.begin(),
-                                        dev_out_lod0.end());
-      framework::LoD out_lod;
-      out_lod.push_back(host_out_lod0);
-      output->set_lod(out_lod);
-
-      // resize output dims
-      output->Resize({static_cast<int64_t>(host_out_lod0.back()), 1});
-
-      if (host_out_lod0.back() == 0) {
-        output->Resize({1, 1});
-        output->mutable_data<T>(ctx.GetPlace());
-        phi::funcs::SetConstant<phi::GPUContext, T> set_constant;
-        set_constant(
-            ctx.template device_context<phi::GPUContext>(), output, -1);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-PD_REGISTER_STRUCT_KERNEL(
-    ctc_align, GPU, ALL_LAYOUT, ops::CTCAlignOpCUDAKernel, int, int64_t) {}
diff --git a/paddle/fluid/operators/ctc_align_op.h b/paddle/fluid/operators/ctc_align_op.h
deleted file mode 100644
index faa2efab772a6..0000000000000
--- a/paddle/fluid/operators/ctc_align_op.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string.h>
-
-#include <vector>
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T, typename DeviceContext>
-class CTCAlignKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<phi::DenseTensor>("Input");
-    auto* output = ctx.Output<phi::DenseTensor>("Output");
-    size_t blank = static_cast<size_t>(ctx.Attr<int>("blank"));
-    bool merge_repeated = ctx.Attr<bool>("merge_repeated");
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
-    auto input_dims = common::vectorize(input->dims());
-    const T* input_data = input->data<T>();
-
-    // support tensor input, no lod information
-    if (input->lod().empty()) {
-      size_t padding_value =
-          static_cast<size_t>(ctx.Attr<int>("padding_value"));
-      auto* input_length = ctx.Input<phi::DenseTensor>("InputLength");
-      const T* input_length_data = input_length->data<T>();
-
-      auto* output_length = ctx.Output<phi::DenseTensor>("OutputLength");
-      T* output_length_data = output_length->mutable_data<T>(ctx.GetPlace());
-
-      for (size_t batch_id = 0; batch_id < (unsigned)input_dims[0];
-           batch_id++) {
-        T prev_token = -1;
-        size_t output_idx = 0;
-        for (size_t i = 0; i < (unsigned)input_length_data[batch_id]; i++) {
-          size_t input_ind = batch_id * input_dims[1] + i;
-          if ((unsigned)input_data[input_ind] != blank &&
-              !(merge_repeated && input_data[input_ind] == prev_token)) {
-            output_data[batch_id * input_dims[1] + output_idx] =
-                input_data[input_ind];
-            ++output_idx;
-          }
-          prev_token = input_data[input_ind];
-        }
-        output_length_data[batch_id] = output_idx;
-        for (size_t j = output_idx; j < (unsigned)input_dims[1]; j++)
-          output_data[batch_id * input_dims[1] + j] = padding_value;
-      }
-    } else {
-      const size_t level = 0;
-      auto input_lod = framework::ToAbsOffset(input->lod());
-
-      // check input dims and lod
-      PADDLE_ENFORCE_EQ(
-          input_dims[0],
-          static_cast<int64_t>(input_lod[level].back()),
-          platform::errors::InvalidArgument(
-              "The first dimension %d of CTCAlign operator Input(Input) should "
-              "be equal to "
-              "the sum of all sequences' lengths %d.",
-              input_dims[0],
-              static_cast<int64_t>(input_lod[level].back())));
-
-      const size_t num_sequences = input_lod[level].size() - 1;
-
-      // merge repeated tokens and delete blank
-      size_t output_idx = 0;
-      std::vector<size_t> output_lod0(1, 0);
-      for (size_t seq_idx = 0; seq_idx < num_sequences; ++seq_idx) {
-        T prev_token = -1;
-        for (size_t i = input_lod[level][seq_idx];
-             i < input_lod[level][seq_idx + 1];
-             ++i) {
-          if ((unsigned)input_data[i] != blank &&
-              !(merge_repeated && input_data[i] == prev_token)) {
-            output_data[output_idx] = input_data[i];
-            ++output_idx;
-          }
-          prev_token = input_data[i];
-        }
-        output_lod0.push_back(output_idx);
-      }
-
-      // set output lod
-      framework::LoD output_lod;
-      output_lod.push_back(output_lod0);
-      output->set_lod(output_lod);
-      // resize output dims
-      output->Resize({static_cast<int64_t>(output_lod0.back()), 1});
-      // for empty sequence
-      if (output_lod0.back() == 0) {
-        output->Resize({1, 1});
-        output_data = output->mutable_data<T>(ctx.GetPlace());
-        output_data[0] = -1;
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/cudnn_rnn_cache.h b/paddle/fluid/operators/cudnn_rnn_cache.h
index 9b6774af5832a..eaca6842d350c 100644
--- a/paddle/fluid/operators/cudnn_rnn_cache.h
+++ b/paddle/fluid/operators/cudnn_rnn_cache.h
@@ -267,7 +267,7 @@ class CudnnRNNCache {
     PADDLE_ENFORCE_EQ(
         weights_size_,
         cudnn_size * weight_numel,
-        platform::errors::InvalidArgument(
+        phi::errors::InvalidArgument(
"The cudnn lstm and setting weight size should be same.")); int dim_w[3]; diff --git a/paddle/fluid/operators/custom_device_common_op_registry.cc b/paddle/fluid/operators/custom_device_common_op_registry.cc index d63197af754f2..d45a1a5a6a675 100644 --- a/paddle/fluid/operators/custom_device_common_op_registry.cc +++ b/paddle/fluid/operators/custom_device_common_op_registry.cc @@ -65,19 +65,19 @@ class CConcatOpCustomDeviceKernel : public framework::OpKernel { PADDLE_ENFORCE_GE(rank, 0, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The value of rank (%d) for c_concat must be " "greater than or equal to 0.", rank)); PADDLE_ENFORCE_GE(nranks, 2, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The value of nranks (%d) for c_concat must be " "greater than or equal to 2.", nranks)); PADDLE_ENFORCE_LT(rank, nranks, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The value of rank (%d) for c_concat must be " "less than that of nranks (%d).", rank, @@ -107,7 +107,7 @@ class CConcatOpCustomDeviceKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( nranks, comm->GetSize(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "nranks: %s should equal to %s", nranks, comm->GetSize())); int64_t send_numel = x->numel(); @@ -160,7 +160,7 @@ class CIdentityOpCustomDeviceKernel : public framework::OpKernel { PADDLE_ENFORCE_GE( rid, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The ring_id (%d) for c_identity op must be non-negative.", rid)); ctx.device_context().Alloc(out); @@ -180,19 +180,19 @@ class CSplitOpCustomDeviceKernel : public framework::OpKernel { PADDLE_ENFORCE_GE(rank, 0, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The value of rank (%d) for c_split must be " "greater than or equal to 0.", rank)); PADDLE_ENFORCE_GE(nranks, 2, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The value of nranks (%d) for c_split must be " "greater than or equal to 2.", nranks)); PADDLE_ENFORCE_LT(rank, nranks, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The value of rank (%d) for c_split must be " "less than that of nranks (%d).", rank, @@ -259,7 +259,7 @@ class CEmbeddingOpCustomDeviceKernel : public framework::OpKernel { *reinterpret_cast(out_tensor.impl().get())) .Resize(out_dims); } else { - PADDLE_THROW(platform::errors::Unavailable( + PADDLE_THROW(phi::errors::Unavailable( "CustomDevice c_embedding ids only support int32 or int64.")); } } @@ -319,7 +319,7 @@ class CEmbeddingGradOpCustomDeviceKernel : public framework::OpKernel { table_grad_t->ShareDataWith( *reinterpret_cast(table_grad_tensor.impl().get())); } else { - PADDLE_THROW(platform::errors::Unavailable( + PADDLE_THROW(phi::errors::Unavailable( "CustomDevice c_embedding ids only support int32 or int64.")); } } @@ -543,11 +543,11 @@ class CAllReduceOpCustomDeviceKernel : public framework::OpKernel { auto place = cond->place(); PADDLE_ENFORCE_EQ(platform::is_cpu_place(place), true, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The input `cond` tensor should be on cpu place")); PADDLE_ENFORCE_EQ(cond->numel(), 1, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The input `cond` should be shape [1]")); if (!cond->data()[0]) { VLOG(4) << "Skip all reduce Op since cond is 0"; @@ -594,8 +594,8 @@ class CAllReduceOpCustomDeviceKernel : public framework::OpKernel { break; default: - 
PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "Invalid reduce type: %d", red_type)); + PADDLE_THROW(phi::errors::InvalidArgument("Invalid reduce type: %d", + red_type)); } auto task = pg->AllReduce(in_tensor, out_tensor, opts); @@ -910,14 +910,14 @@ class GlobalScatterOpCustomDeviceKernel : public framework::OpKernel { const auto& dev_ctx = ctx.template device_context(); auto place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(local_count->dtype(), - phi::DataType::INT64, - platform::errors::InvalidArgument( - "Please use int64 type in local_count.")); - PADDLE_ENFORCE_EQ(global_count->dtype(), - phi::DataType::INT64, - platform::errors::InvalidArgument( - "Please use int64 type in global_count.")); + PADDLE_ENFORCE_EQ( + local_count->dtype(), + phi::DataType::INT64, + phi::errors::InvalidArgument("Please use int64 type in local_count.")); + PADDLE_ENFORCE_EQ( + global_count->dtype(), + phi::DataType::INT64, + phi::errors::InvalidArgument("Please use int64 type in global_count.")); auto map = distributed::ProcessGroupMapFromGid::getInstance(); const int64_t* cpu_local_count_data; @@ -1124,14 +1124,14 @@ class GlobalGatherOpCustomDeviceKernel : public framework::OpKernel { auto place = ctx.GetPlace(); auto out = ctx.Output("Out"); - PADDLE_ENFORCE_EQ(local_count->dtype(), - phi::DataType::INT64, - platform::errors::InvalidArgument( - "Please use int64 type in local_count.")); - PADDLE_ENFORCE_EQ(global_count->dtype(), - phi::DataType::INT64, - platform::errors::InvalidArgument( - "Please use int64 type in global_count.")); + PADDLE_ENFORCE_EQ( + local_count->dtype(), + phi::DataType::INT64, + phi::errors::InvalidArgument("Please use int64 type in local_count.")); + PADDLE_ENFORCE_EQ( + global_count->dtype(), + phi::DataType::INT64, + phi::errors::InvalidArgument("Please use int64 type in global_count.")); const int64_t* cpu_local_count_data; const int64_t* cpu_global_count_data; @@ -1370,7 +1370,7 @@ void RegisterCustomDeviceCommonKernel(const std::string& dev_type) { float>, paddle::operators::CConcatOpCustomDeviceKernel< paddle::platform::CustomDeviceContext, - paddle::platform::float16>); + phi::dtype::float16>); REGISTER_OP_CUSTOM_DEVICE_KERNEL( c_split, device_type, @@ -1382,7 +1382,7 @@ void RegisterCustomDeviceCommonKernel(const std::string& dev_type) { int>, paddle::operators::CSplitOpCustomDeviceKernel< paddle::platform::CustomDeviceContext, - paddle::platform::float16>); + phi::dtype::float16>); REGISTER_OP_CUSTOM_DEVICE_KERNEL( c_embedding, device_type, @@ -1391,7 +1391,7 @@ void RegisterCustomDeviceCommonKernel(const std::string& dev_type) { float>, paddle::operators::CEmbeddingOpCustomDeviceKernel< paddle::platform::CustomDeviceContext, - paddle::platform::float16>); + phi::dtype::float16>); REGISTER_OP_CUSTOM_DEVICE_KERNEL( c_embedding_grad, device_type, @@ -1400,7 +1400,7 @@ void RegisterCustomDeviceCommonKernel(const std::string& dev_type) { float>, paddle::operators::CEmbeddingGradOpCustomDeviceKernel< paddle::platform::CustomDeviceContext, - paddle::platform::float16>); + phi::dtype::float16>); REGISTER_OP_CUSTOM_DEVICE_KERNEL( c_softmax_with_cross_entropy, @@ -1413,7 +1413,7 @@ void RegisterCustomDeviceCommonKernel(const std::string& dev_type) { double>, paddle::operators::CSoftmaxWithCrossEntropyOpCustomDeviceKernel< paddle::platform::CustomDeviceContext, - paddle::platform::float16>) {} + phi::dtype::float16>) {} REGISTER_OP_CUSTOM_DEVICE_KERNEL( c_softmax_with_cross_entropy_grad, @@ -1426,7 +1426,7 @@ void RegisterCustomDeviceCommonKernel(const std::string& 
dev_type) { double>, paddle::operators::CSoftmaxWithCrossEntropyGradCustomDeviceKernel< paddle::platform::CustomDeviceContext, - paddle::platform::float16>) {} + phi::dtype::float16>) {} REGISTER_OP_CUSTOM_DEVICE_KERNEL( c_identity, @@ -1445,7 +1445,7 @@ void RegisterCustomDeviceCommonKernel(const std::string& dev_type) { int64_t>, paddle::operators::CIdentityOpCustomDeviceKernel< paddle::platform::CustomDeviceContext, - paddle::platform::float16>) {} + phi::dtype::float16>) {} REGISTER_OP_CUSTOM_DEVICE_KERNEL( c_sync_calc_stream, @@ -1467,7 +1467,7 @@ void RegisterCustomDeviceCommonKernel(const std::string& dev_type) { double>, paddle::operators::CSyncCalcStreamCustomDeviceKernel< paddle::platform::CustomDeviceContext, - paddle::platform::float16>) {} + phi::dtype::float16>) {} REGISTER_OP_CUSTOM_DEVICE_KERNEL( c_allreduce_sum, device_type, @@ -1481,7 +1481,7 @@ void RegisterCustomDeviceCommonKernel(const std::string& dev_type) { phi::ccl::CCLReduceOp::SUM>, paddle::operators::CAllReduceOpCustomDeviceKernel< paddle::platform::CustomDeviceContext, - paddle::platform::float16, + phi::dtype::float16, phi::ccl::CCLReduceOp::SUM>, paddle::operators::CAllReduceOpCustomDeviceKernel< paddle::platform::CustomDeviceContext, @@ -1504,7 +1504,7 @@ void RegisterCustomDeviceCommonKernel(const std::string& dev_type) { phi::ccl::CCLReduceOp::SUM>, paddle::operators::CAllReduceOpCustomDeviceKernel< paddle::platform::CustomDeviceContext, - paddle::platform::float16, + phi::dtype::float16, phi::ccl::CCLReduceOp::SUM>, paddle::operators::CAllReduceOpCustomDeviceKernel< paddle::platform::CustomDeviceContext, @@ -1527,7 +1527,7 @@ void RegisterCustomDeviceCommonKernel(const std::string& dev_type) { phi::ccl::CCLReduceOp::MIN>, paddle::operators::CAllReduceOpCustomDeviceKernel< paddle::platform::CustomDeviceContext, - paddle::platform::float16, + phi::dtype::float16, phi::ccl::CCLReduceOp::MIN>, paddle::operators::CAllReduceOpCustomDeviceKernel< paddle::platform::CustomDeviceContext, @@ -1550,7 +1550,7 @@ void RegisterCustomDeviceCommonKernel(const std::string& dev_type) { phi::ccl::CCLReduceOp::MAX>, paddle::operators::CAllReduceOpCustomDeviceKernel< paddle::platform::CustomDeviceContext, - paddle::platform::float16, + phi::dtype::float16, phi::ccl::CCLReduceOp::MAX>, paddle::operators::CAllReduceOpCustomDeviceKernel< paddle::platform::CustomDeviceContext, @@ -1573,7 +1573,7 @@ void RegisterCustomDeviceCommonKernel(const std::string& dev_type) { phi::ccl::CCLReduceOp::PRODUCT>, paddle::operators::CAllReduceOpCustomDeviceKernel< paddle::platform::CustomDeviceContext, - paddle::platform::float16, + phi::dtype::float16, phi::ccl::CCLReduceOp::PRODUCT>, paddle::operators::CAllReduceOpCustomDeviceKernel< paddle::platform::CustomDeviceContext, @@ -1590,8 +1590,7 @@ void RegisterCustomDeviceCommonKernel(const std::string& dev_type) { paddle::operators::CBroadcastOpCustomDeviceKernel, paddle::operators::CBroadcastOpCustomDeviceKernel, paddle::operators::CBroadcastOpCustomDeviceKernel, - paddle::operators::CBroadcastOpCustomDeviceKernel< - paddle::platform::float16>) {} + paddle::operators::CBroadcastOpCustomDeviceKernel) {} REGISTER_OP_CUSTOM_DEVICE_KERNEL( barrier, device_type, @@ -1614,7 +1613,7 @@ void RegisterCustomDeviceCommonKernel(const std::string& dev_type) { paddle::operators::RandomRoutingOpCustomDeviceKernel, paddle::operators::RandomRoutingOpCustomDeviceKernel, paddle::operators::RandomRoutingOpCustomDeviceKernel< - paddle::platform::float16>) {} + phi::dtype::float16>) {} 
REGISTER_OP_CUSTOM_DEVICE_KERNEL( assign_pos, device_type, @@ -1628,7 +1627,7 @@ void RegisterCustomDeviceCommonKernel(const std::string& dev_type) { paddle::operators::GlobalScatterOpCustomDeviceKernel, paddle::operators::GlobalScatterOpCustomDeviceKernel, paddle::operators::GlobalScatterOpCustomDeviceKernel< - paddle::platform::float16>) {} + phi::dtype::float16>) {} REGISTER_OP_CUSTOM_DEVICE_KERNEL( global_gather, device_type, @@ -1637,7 +1636,7 @@ void RegisterCustomDeviceCommonKernel(const std::string& dev_type) { paddle::operators::GlobalGatherOpCustomDeviceKernel, paddle::operators::GlobalGatherOpCustomDeviceKernel, paddle::operators::GlobalGatherOpCustomDeviceKernel< - paddle::platform::float16>) {} + phi::dtype::float16>) {} #endif } diff --git a/paddle/fluid/operators/cvm_op.cc b/paddle/fluid/operators/cvm_op.cc index 1e414ff217c2f..a305263338769 100644 --- a/paddle/fluid/operators/cvm_op.cc +++ b/paddle/fluid/operators/cvm_op.cc @@ -33,8 +33,8 @@ class CVMOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( x_dims.size(), 2UL, - platform::errors::InvalidArgument( - "Input(X)'s rank should be 2, but got %d", x_dims.size())); + phi::errors::InvalidArgument("Input(X)'s rank should be 2, but got %d", + x_dims.size())); if (ctx->Attrs().Get("use_cvm")) { ctx->SetOutputDim("Y", {x_dims[0], x_dims[1]}); @@ -77,23 +77,23 @@ class CVMGradientOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( x_dims.size(), 2, - platform::errors::InvalidArgument( - "Expect Input(X)'s rank == 2, but got %d", x_dims.size())); + phi::errors::InvalidArgument("Expect Input(X)'s rank == 2, but got %d", + x_dims.size())); PADDLE_ENFORCE_EQ( dy_dims.size(), 2, - platform::errors::InvalidArgument( - "Expect Input(X)'s rank == 2, but got %d", dy_dims.size())); + phi::errors::InvalidArgument("Expect Input(X)'s rank == 2, but got %d", + dy_dims.size())); PADDLE_ENFORCE_EQ( cvm_dims.size(), 2, - platform::errors::InvalidArgument( - "Expect Input(X)'s rank == 2, but got %d", cvm_dims.size())); + phi::errors::InvalidArgument("Expect Input(X)'s rank == 2, but got %d", + cvm_dims.size())); PADDLE_ENFORCE_EQ( x_dims[0], dy_dims[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The 1st dimension of Input(X) and Input(Y@Grad) should " "be equal, X is %d, Y@Grad is %d", x_dims[0], @@ -102,7 +102,7 @@ class CVMGradientOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( cvm_dims[1], 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "When Attr(soft_label) == false, the 2nd dimension of " "Input(CVM) should be 2, but got %d cvm_dims[1]")); ctx->SetOutputDim(framework::GradVarName("X"), x_dims); diff --git a/paddle/fluid/operators/cvm_op.cu b/paddle/fluid/operators/cvm_op.cu index 1fbce90e494a0..5e127a532267b 100644 --- a/paddle/fluid/operators/cvm_op.cu +++ b/paddle/fluid/operators/cvm_op.cu @@ -110,7 +110,7 @@ class CVMCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( batch_size, lod[lod.size() - 1], - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Input(X)'s dim[0] must be equal to last element of lod")); CvmComputeKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, @@ -164,7 +164,7 @@ class CVMGradCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( batch_size, lod[lod.size() - 1], - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Output(X@GRAD)'s dim[0] must be equal to last element of lod")); phi::MixVector mixv_lod(&lod); CvmGradComputeKernel<<<(dx_numel 
+ PADDLE_CUDA_NUM_THREADS - 1) / diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc index cc3a224a7e862..750310547306d 100644 --- a/paddle/fluid/operators/data_norm_op.cc +++ b/paddle/fluid/operators/data_norm_op.cc @@ -57,11 +57,11 @@ class DataNormOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->HasInput("scale_w"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(scale_w) of DataNormOp should not be null.")); PADDLE_ENFORCE_EQ(ctx->HasInput("bias"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(bias) of DataNormOp should not be null.")); } @@ -69,39 +69,39 @@ class DataNormOp : public framework::OperatorWithKernel { const DataLayout data_layout = common::StringToDataLayout( ctx->Attrs().Get("data_layout")); - PADDLE_ENFORCE_EQ(x_dims.size() >= 2 && x_dims.size() <= 5, - true, - platform::errors::InvalidArgument( - "Input X must have 2 to 5 dimensions.")); + PADDLE_ENFORCE_EQ( + x_dims.size() >= 2 && x_dims.size() <= 5, + true, + phi::errors::InvalidArgument("Input X must have 2 to 5 dimensions.")); const int64_t C = (data_layout == DataLayout::kNCHW ? x_dims[1] : x_dims[x_dims.size() - 1]); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize").size(), - 1UL, - platform::errors::InvalidArgument( - "The input dim of BatchSize should be 1")); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum").size(), - 1UL, - platform::errors::InvalidArgument( - "The input dim of BatchSum should be 1")); + PADDLE_ENFORCE_EQ( + ctx->GetInputDim("BatchSize").size(), + 1UL, + phi::errors::InvalidArgument("The input dim of BatchSize should be 1")); + PADDLE_ENFORCE_EQ( + ctx->GetInputDim("BatchSum").size(), + 1UL, + phi::errors::InvalidArgument("The input dim of BatchSum should be 1")); PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum").size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input dim of BatchSquareSum should be 1")); if (ctx->IsRuntime()) { PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize")[0], C, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input dim[0] of BatchSize should be C")); PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum")[0], C, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input dim[0] of BatchSum should be C")); PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum")[0], C, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input dim[0] of BatchSquareSum should be C")); } @@ -112,21 +112,21 @@ class DataNormOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( scale_dim.size(), 1UL, - platform::errors::InvalidArgument("the dimension of scale" - "must equal to 1. But received: " - "the shape of scale is [%s], " - "the dimension of scale is [%d]", - scale_dim, - scale_dim.size())); + phi::errors::InvalidArgument("the dimension of scale" + "must equal to 1. But received: " + "the shape of scale is [%s], " + "the dimension of scale is [%d]", + scale_dim, + scale_dim.size())); PADDLE_ENFORCE_EQ( bias_dim.size(), 1UL, - platform::errors::InvalidArgument("the dimension of bias" - "must equal to 1. But received: " - "the shape of bias is [%s]," - "the dimension of bias is [%d]", - bias_dim, - bias_dim.size())); + phi::errors::InvalidArgument("the dimension of bias" + "must equal to 1. 
But received: " + "the shape of bias is [%s]," + "the dimension of bias is [%d]", + bias_dim, + bias_dim.size())); bool check = true; if ((!ctx->IsRuntime()) && @@ -137,14 +137,14 @@ class DataNormOp : public framework::OperatorWithKernel { if (check) { PADDLE_ENFORCE_EQ(scale_dim[0], C, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "the shape of scale must equal to [%d]" "But received: the shape of scale is [%d]", C, scale_dim[0])); PADDLE_ENFORCE_EQ(bias_dim[0], C, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "the shape of bias must equal to [%d]" "But received: the shape of bias is [%d]", C, @@ -171,28 +171,28 @@ class DataNormOp : public framework::OperatorWithKernel { } PADDLE_ENFORCE_EQ(dn_param_type, OperatorWithKernel::IndicateVarDataType(ctx, "BatchSize"), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "BatchSize input should be of float type")); - PADDLE_ENFORCE_EQ(dn_param_type, - OperatorWithKernel::IndicateVarDataType(ctx, "BatchSum"), - platform::errors::InvalidArgument( - "BatchSum input should be of float type")); + PADDLE_ENFORCE_EQ( + dn_param_type, + OperatorWithKernel::IndicateVarDataType(ctx, "BatchSum"), + phi::errors::InvalidArgument("BatchSum input should be of float type")); PADDLE_ENFORCE_EQ( dn_param_type, OperatorWithKernel::IndicateVarDataType(ctx, "BatchSquareSum"), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "BatchSquareSum input should be of float type")); bool enable_scale_and_shift = ctx.Attr("enable_scale_and_shift"); if (enable_scale_and_shift) { PADDLE_ENFORCE_EQ(dn_param_type, OperatorWithKernel::IndicateVarDataType(ctx, "scale_w"), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "scale_w input should be of float type")); - PADDLE_ENFORCE_EQ(dn_param_type, - OperatorWithKernel::IndicateVarDataType(ctx, "bias"), - platform::errors::InvalidArgument( - "bias input should be of float type")); + PADDLE_ENFORCE_EQ( + dn_param_type, + OperatorWithKernel::IndicateVarDataType(ctx, "bias"), + phi::errors::InvalidArgument("bias input should be of float type")); } return phi::KernelKey(input_data_type, ctx.GetPlace()); @@ -208,7 +208,7 @@ class DataNormOpMaker : public framework::OpProtoAndCheckerMaker { .AddCustomChecker([](const float &epsilon) { PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'epsilon' should be between 0.0 and 0.001.")); }); AddAttr("slot_dim", @@ -279,7 +279,7 @@ class DataNormKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( x_dims.size(), 2, - platform::errors::InvalidArgument("The Input dim size should be 2")); + phi::errors::InvalidArgument("The Input dim size should be 2")); const int N = static_cast(x_dims[0]); const int C = static_cast(data_layout == DataLayout::kNCHW ? 
x_dims[1] @@ -287,11 +287,11 @@ class DataNormKernel : public framework::OpKernel { PADDLE_ENFORCE_LT(0, N, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dims of Input(X) should be greater than 0.")); PADDLE_ENFORCE_LT(0, C, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dims of Input(X) should be greater than 0.")); auto *y = ctx.Output("Y"); @@ -401,7 +401,7 @@ class DataNormKernel : public framework::OpKernel { break; } default: - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Unknown storage order: %d, please use NCHW or NHWC", data_layout)); } } @@ -421,17 +421,17 @@ class DataNormGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->HasOutput("BatchSize"), true, - platform::errors::NotFound( + phi::errors::NotFound( "Output(BatchSize) of DataNormGradOp should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasOutput("BatchSum"), true, - platform::errors::NotFound( + phi::errors::NotFound( "Output(BatchSum) of DataNormGradOp should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasOutput("BatchSquareSum"), true, - platform::errors::NotFound( + phi::errors::NotFound( "Output(BatchSquareSum) of DataNormGradOp should not be null.")); OP_INOUT_CHECK(ctx->HasInput("Means"), "Input", "Means", "DataNormGrad"); OP_INOUT_CHECK(ctx->HasInput("Scales"), "Input", "Scales", "DataNormGrad"); @@ -471,7 +471,7 @@ class DataNormGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ((has_scale_grad == has_bias_grad), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(Scale@GRAD) and Output(Bias@GRAD)" "must be null or not be null at same time. " "But now, has Scale@Grad=[%d], has Bias@GRAD=[%d]", @@ -489,7 +489,7 @@ class DataNormGradOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { const auto *var = ctx.InputVar(framework::GradVarName("Y")); if (var == nullptr) { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Y@GRAD can not be found for computation")); } const phi::DenseTensor *t = nullptr; @@ -497,7 +497,7 @@ class DataNormGradOp : public framework::OperatorWithKernel { t = &var->Get(); } if (t == nullptr) { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Y@GRAD can not be found for computation")); } @@ -524,7 +524,7 @@ class DataNormGradKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( x_dims.size(), 2, - platform::errors::InvalidArgument("The Input dim size should be 2")); + phi::errors::InvalidArgument("The Input dim size should be 2")); const int N = static_cast(x_dims[0]); const int C = static_cast(data_layout == DataLayout::kNCHW ? 
x_dims[1] @@ -710,7 +710,7 @@ class DataNormGradKernel : public framework::OpKernel { break; } default: - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Unknown storage order: %s, please use NCHW or NHWC", data_layout_str)); } diff --git a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu index 33cd6a8e6e49c..4be27b671d8a5 100644 --- a/paddle/fluid/operators/data_norm_op.cu +++ b/paddle/fluid/operators/data_norm_op.cu @@ -115,17 +115,17 @@ class DataNormKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( x_dims.size(), 2, - platform::errors::PreconditionNotMet("The Input dim size should be 2")); + phi::errors::PreconditionNotMet("The Input dim size should be 2")); const int N = x_dims[0]; const int C = x_dims[1]; PADDLE_ENFORCE_LT(0, N, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dims of Input(X) should be greater than 0.")); PADDLE_ENFORCE_LT(0, C, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dims of Input(X) should be greater than 0.")); const T *batch_size_in = @@ -174,7 +174,7 @@ class DataNormGradKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( x_dims.size(), 2, - platform::errors::PreconditionNotMet("The Input dim size should be 2")); + phi::errors::PreconditionNotMet("The Input dim size should be 2")); const int N = x_dims[0]; const int C = x_dims[1]; @@ -226,7 +226,7 @@ class DataNormGradKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( comm_context_manager.Has(std::to_string(rid)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. " @@ -238,7 +238,7 @@ class DataNormGradKernel : public framework::OpKernel { PADDLE_ENFORCE_NE( comm_ctx, nullptr, - platform::errors::Unavailable( + phi::errors::Unavailable( "NCCLCommContext is nullptr, collective op should " "has ring_id attr.")); } else { @@ -305,7 +305,7 @@ class DataNormGradKernel : public framework::OpKernel { } platform::GpuStreamSync(stream); #else - PADDLE_THROW(platform::errors::PreconditionNotMet( + PADDLE_THROW(phi::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU, and need_sync_stats connot be " "supported on windows now.")); #endif diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cc b/paddle/fluid/operators/deformable_psroi_pooling_op.cc index 5b339cf96c2b1..1b6ed2ba0be62 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cc +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cc @@ -148,7 +148,7 @@ class DeformablePSROIPoolOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( rois_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(ROIs) should be a 2-D phi::DenseTensor of shape (num_rois, " "4) " "given as [[ x1, y1, x2, y2], ...]. 
The rank of Input(ROIs) should "
@@ -158,12 +158,12 @@ class DeformablePSROIPoolOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(
         trans_dims.size(),
         4,
-        platform::errors::InvalidArgument("The rank of Input(Trans) should be "
-                                          "4 and the shape of Trans should be "
-                                          "(N, 2, H, W), but received Trans "
-                                          "rank is:%d and Trans shape is:[%s].",
-                                          trans_dims.size(),
-                                          trans_dims));
+        phi::errors::InvalidArgument("The rank of Input(Trans) should be "
+                                     "4 and the shape of Trans should be "
+                                     "(N, 2, H, W), but received Trans "
+                                     "rank is:%d and Trans shape is:[%s].",
+                                     trans_dims.size(),
+                                     trans_dims));
     auto pooled_height = ctx->Attrs().Get<int>("pooled_height");
     auto pooled_width = ctx->Attrs().Get<int>("pooled_width");
     auto spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
@@ -176,17 +176,17 @@ class DeformablePSROIPoolOp : public framework::OperatorWithKernel {
     auto part_width = part_size[1];
     auto sample_per_part = ctx->Attrs().Get<int>("sample_per_part");
     auto trans_std = ctx->Attrs().Get<float>("trans_std");
-    PADDLE_ENFORCE_GE(trans_std,
-                      0.,
-                      platform::errors::InvalidArgument(
-                          "Input(trans_std) should not be lower "
-                          "than 0.0, but received trans_std "
-                          "is:%f",
-                          trans_std));
+    PADDLE_ENFORCE_GE(
+        trans_std,
+        0.,
+        phi::errors::InvalidArgument("Input(trans_std) should not be lower "
+                                     "than 0.0, but received trans_std "
+                                     "is:%f",
+                                     trans_std));
     PADDLE_ENFORCE_GE(
         input_dims[1],
         output_channels,
-        platform::errors::InvalidArgument(
+        phi::errors::InvalidArgument(
             "The channel of Input(Input) should not be lower than "
             "Input(output_dim), "
             "but received Input channel is:%d and output_dim is:%d.",
@@ -195,70 +195,70 @@ class DeformablePSROIPoolOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_GT(
         pooled_height,
         0,
-        platform::errors::InvalidArgument(
+        phi::errors::InvalidArgument(
             "Input(pooled_height) should be greater than 0, but received "
             "pooled_height is:%d.",
             pooled_height));
     PADDLE_ENFORCE_GT(
         pooled_width,
         0,
-        platform::errors::InvalidArgument(
+        phi::errors::InvalidArgument(
             "Input(pooled_width) should be greater than 0, but received "
             "pooled_width is:%d.",
             pooled_width));
     PADDLE_ENFORCE_GT(
         spatial_scale,
         0.,
-        platform::errors::InvalidArgument(
+        phi::errors::InvalidArgument(
            "Input(spatial_scale) should be greater than 0., but received "
            "spatial_scale is:%f.",
            spatial_scale));
     PADDLE_ENFORCE_EQ(
         group_size.size(),
         2,
-        platform::errors::InvalidArgument(
+        phi::errors::InvalidArgument(
            "The length of Input(group_size) should be 2, but received "
            "group_size length is:%d.",
            group_size.size()));
     PADDLE_ENFORCE_GT(
         group_height,
         0,
-        platform::errors::InvalidArgument(
+        phi::errors::InvalidArgument(
            "group_height in Input(group_size) should be greater than 0, "
            "but received group_height is:%d.",
            group_height));
     PADDLE_ENFORCE_GT(
         group_width,
         0,
-        platform::errors::InvalidArgument(
+        phi::errors::InvalidArgument(
            "group_width in Input(group_size) should be greater than 0 "
            "but received group_width is:%d.",
            group_width));
     PADDLE_ENFORCE_EQ(
         part_size.size(),
         2,
-        platform::errors::InvalidArgument(
+        phi::errors::InvalidArgument(
            "The length of Input(part_size) should be 2, but received "
            "part_size length is:%d.",
            part_size.size()));
     PADDLE_ENFORCE_GT(
         part_height,
         0,
-        platform::errors::InvalidArgument(
+        phi::errors::InvalidArgument(
            "part_height in Input(part_size) should be greater than 0 "
            "but received part_height is:%d.",
            part_height));
     PADDLE_ENFORCE_GT(
         part_width,
         0,
-        platform::errors::InvalidArgument(
+        phi::errors::InvalidArgument(
            "part_width in
Input(part_size) should be greater than 0 " "but received part_width is:%d.", part_width)); PADDLE_ENFORCE_LE( part_height, trans_dims[2], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "part_height in Input(part_size) should not be greater than " "the height of Input(Trans), but received part_height is:%d, " "the height of Input(Trans) is:%d.", @@ -267,7 +267,7 @@ class DeformablePSROIPoolOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_LE( part_width, trans_dims[3], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "part_width in Input(part_size) should not be greater than " "the width of Input(Trans), but received part_width is:%d, " "the width of Input(Trans) is:%d.", @@ -276,7 +276,7 @@ class DeformablePSROIPoolOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT( sample_per_part, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(sample_per_part) should be greater than 0, but received " "sample_per_part is:%d.", sample_per_part)); diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu index a3f045fd50a5f..1dfc02943b7fb 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -213,7 +213,7 @@ class DeformablePSROIPoolCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( num_rois, out->dims()[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of Input(ROIs) should be same with the number of " "Output(Output), but received ROIs number is:%d, Output number " "is:%d.", @@ -225,7 +225,7 @@ class DeformablePSROIPoolCUDAKernel : public framework::OpKernel { no_trans ? output_dim : output_dim / num_classes; PADDLE_ENFORCE_GE(channels_each_class, 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "channels_each_class should not be lower than 1, but " "channels_each_class is:%d.", channels_each_class)); @@ -243,7 +243,7 @@ class DeformablePSROIPoolCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( rois_batch_size, batch, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "rois_batch_size should be equal to the batch_size, but " "rois_batch_size is:%d, batch_size is:%d.", rois_batch_size, @@ -251,7 +251,7 @@ class DeformablePSROIPoolCUDAKernel : public framework::OpKernel { int rois_num_with_lod = rois_lod[rois_batch_size]; PADDLE_ENFORCE_EQ(num_rois, rois_num_with_lod, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rois_num from input and lod must be same, but" "rois_num from input is:%d, rois_num from lod is:%d.", num_rois, @@ -555,7 +555,7 @@ class DeformablePSROIPoolGradCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( rois_batch_size, batch, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "rois_batch_size should be equal to the batch_size, but " "rois_batch_size is:%d, batch_size is:%d.", rois_batch_size, @@ -564,7 +564,7 @@ class DeformablePSROIPoolGradCUDAKernel : public framework::OpKernel { int rois_num_with_lod = rois_lod[rois_batch_size]; PADDLE_ENFORCE_EQ(num_rois, rois_num_with_lod, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rois_num from input and lod must be same, but" "rois_num from input is:%d, rois_num from lod is:%d.", num_rois, diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.h b/paddle/fluid/operators/deformable_psroi_pooling_op.h index 
1ff1c83206f50..417e2da3468aa 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.h +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.h @@ -187,7 +187,7 @@ class DeformablePSROIPoolCPUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( num_rois, out->dims()[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of Input(ROIs) should be same with the number of " "Output(Output), but received ROIs number is:%d, Output number " "is:%d.", @@ -221,7 +221,7 @@ class DeformablePSROIPoolCPUKernel : public framework::OpKernel { auto channels_each_class = no_trans ? output_dim : output_dim / num_classes; PADDLE_ENFORCE_GE(channels_each_class, 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "channels_each_class should not be lower than 1, but " "channels_each_class is:%d.", channels_each_class)); @@ -238,7 +238,7 @@ class DeformablePSROIPoolCPUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( rois_batch_size, batch, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "rois_batch_size should be equal to the batch_size, but " "rois_batch_size is:%d, batch_size is:%d.", rois_batch_size, @@ -246,7 +246,7 @@ class DeformablePSROIPoolCPUKernel : public framework::OpKernel { int rois_num_with_lod = rois_lod[rois_batch_size]; PADDLE_ENFORCE_EQ(num_rois, rois_num_with_lod, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rois_num from input and lod must be same, but" "rois_num from input is:%d, rois_num from lod is:%d.", num_rois, @@ -542,7 +542,7 @@ class DeformablePSROIPoolGradCPUKernel : public framework::OpKernel { int rois_num_with_lod = rois_lod[rois_batch_size]; PADDLE_ENFORCE_EQ(num_rois, rois_num_with_lod, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rois_num from input and lod must be same, but" "rois_num from input is:%d, rois_num from lod is:%d.", num_rois, diff --git a/paddle/fluid/operators/delete_var_op.cc b/paddle/fluid/operators/delete_var_op.cc deleted file mode 100644 index 671c29d40cfb9..0000000000000 --- a/paddle/fluid/operators/delete_var_op.cc +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class OpDesc; -class Scope; -template -class EmptyGradOpMaker; -} // namespace framework -namespace imperative { -class OpBase; -} // namespace imperative -} // namespace paddle - -namespace paddle { -namespace operators { -class DeleteVarOp : public framework::OperatorBase { - public: - DeleteVarOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - dev_ctx.Wait(); - - auto delete_var_names = Inputs("X"); - const_cast(scope).EraseVars(delete_var_names); - } -}; - -class DeleteVarOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override {} -}; - -class DeleteVarOpInfoMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "The input of delete op").AsDuplicable(); - AddComment(R"DOC( -Delete Operator. -It should not be configured by users directly. -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR( - delete_var, - paddle::operators::DeleteVarOp, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::operators::DeleteVarOpInfoMaker, - paddle::operators::DeleteVarOpShapeInference); diff --git a/paddle/fluid/operators/dequantize_log_op.cc b/paddle/fluid/operators/dequantize_log_op.cc index 03ede45695148..7526bdb49eafd 100644 --- a/paddle/fluid/operators/dequantize_log_op.cc +++ b/paddle/fluid/operators/dequantize_log_op.cc @@ -62,14 +62,14 @@ class DequantizeLogOp : public framework::OperatorWithKernel { : OperatorWithKernel(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), - true, - platform::errors::NotFound( - "Input(X) of DequantizeLogOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), - true, - platform::errors::NotFound( - "Output(Out) of DequantizeLogOp is not found.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("X"), + true, + phi::errors::NotFound("Input(X) of DequantizeLogOp is not found.")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput("Out"), + true, + phi::errors::NotFound("Output(Out) of DequantizeLogOp is not found.")); ctx->ShareDim("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out"); diff --git a/paddle/fluid/operators/dequeue_op.cc b/paddle/fluid/operators/dequeue_op.cc deleted file mode 100644 index 9e5b809e772b6..0000000000000 --- a/paddle/fluid/operators/dequeue_op.cc +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" -using LoDTensorBlockingQueueHolder = - paddle::operators::reader::LoDTensorBlockingQueueHolder; - -namespace paddle { -namespace operators { - -class DequeueOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - DequeueOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - const std::string& queue_name = Attr("queue_name"); - auto* queue_holder_var = scope.FindVar(queue_name); - PADDLE_ENFORCE_NOT_NULL( - queue_holder_var, - platform::errors::NotFound( - "No LoDTensorBlockingQueueHolder variable with name %s found.", - queue_name)); - auto* queue_holder = - queue_holder_var->template GetMutable(); - auto& out_names = Outputs("Out"); - PADDLE_ENFORCE_GT(out_names.size(), - 0, - platform::errors::InvalidArgument( - "The output for Op(dequeue) must be set.")); - for (const auto& out_name : out_names) { - auto out_var = scope.FindVar(out_name); - PADDLE_ENFORCE_NOT_NULL(out_var, - platform::errors::NotFound( - "No variable with name %s found", out_name)); - auto* out_tensor = out_var->GetMutable(); - PADDLE_ENFORCE_NOT_NULL( - out_tensor, - platform::errors::InvalidArgument( - "Variable with name %s has not been initialized.", out_name)); - - paddle::framework::LoDTensorArray lod_tensor_vec; - bool success = false; - lod_tensor_vec = queue_holder->GetQueue()->Pop(&success); - PADDLE_ENFORCE_EQ(lod_tensor_vec.size(), - 1, - platform::errors::InvalidArgument( - "Expected to pop only one element per Pop call for " - "Op(dequeue), but poped %d element.", - lod_tensor_vec.size())); - for (auto& lod_tensor : lod_tensor_vec) { - paddle::framework::TensorCopySync(lod_tensor, dev_place, out_tensor); - } - } - } -}; - -class DequeueOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddAttr("queue_name", - "Name of the `LoDTensorBlockingQueueHolder` variable"); - AddOutput("Out", "A list of `lod_tensor` to dequeue and assigned.") - .AsDuplicable(); - AddComment(R"DOC( - Dequeue operator. 
- )DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = ::paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(dequeue, ops::DequeueOp, ops::DequeueOpMaker); diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index c9bee1eb60705..9aa19af0ba809 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -31,31 +31,17 @@ endfunction() detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu) -if(WITH_XPU) - detection_library(iou_similarity_op SRCS iou_similarity_op.cc - iou_similarity_op_xpu.cc) -else() - detection_library(iou_similarity_op SRCS iou_similarity_op.cc - iou_similarity_op.cu) -endif() - detection_library(bipartite_match_op SRCS bipartite_match_op.cc) -detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc) detection_library(anchor_generator_op SRCS anchor_generator_op.cc anchor_generator_op.cu) detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc polygon_box_transform_op.cu) -detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc) detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc DEPS phi common) -detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS phi - common) detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu) detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu) -detection_library(retinanet_detection_output_op SRCS - retinanet_detection_output_op.cc) if(WITH_GPU OR WITH_ROCM) if(WITH_GPU) diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cc b/paddle/fluid/operators/detection/anchor_generator_op.cc index 8c3705ba3e760..3b826d5c249e1 100644 --- a/paddle/fluid/operators/detection/anchor_generator_op.cc +++ b/paddle/fluid/operators/detection/anchor_generator_op.cc @@ -25,24 +25,24 @@ class AnchorGeneratorOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->HasInput("Input"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(Input) of AnchorGeneratorOp should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasOutput("Anchors"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(Anchors) of AnchorGeneratorOp should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasOutput("Variances"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(Variances) of AnchorGeneratorOp should not be null.")); auto input_dims = ctx->GetInputDim("Input"); PADDLE_ENFORCE_EQ( input_dims.size(), 4, - platform::errors::InvalidArgument("The layout of input is NCHW.")); + phi::errors::InvalidArgument("The layout of input is NCHW.")); auto anchor_sizes = ctx->Attrs().Get>("anchor_sizes"); auto aspect_ratios = ctx->Attrs().Get>("aspect_ratios"); @@ -98,12 +98,12 @@ class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker { .AddCustomChecker([](const std::vector& anchor_sizes) { PADDLE_ENFORCE_GT(anchor_sizes.size(), 0UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Size of anchor_sizes must be at least 1.")); for (size_t i = 0; i < anchor_sizes.size(); ++i) { PADDLE_ENFORCE_GT(anchor_sizes[i], 0.0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "anchor_sizes[%d] must be positive.", i)); } }); @@ -118,14 +118,14 @@ class 
@@ -118,14 +118,14 @@ class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker {
              "(vector<float>) List of variances to be used "
              "in box regression deltas")
         .AddCustomChecker([](const std::vector<float>& variances) {
-          PADDLE_ENFORCE_EQ(variances.size(),
-                            4UL,
-                            platform::errors::InvalidArgument(
-                                "Must provide 4 variance only."));
+          PADDLE_ENFORCE_EQ(
+              variances.size(),
+              4UL,
+              phi::errors::InvalidArgument("Must provide 4 variance only."));
           for (size_t i = 0; i < variances.size(); ++i) {
             PADDLE_ENFORCE_GT(variances[i],
                               0.0,
-                              platform::errors::InvalidArgument(
+                              phi::errors::InvalidArgument(
                                   "variance[%d] must be greater than 0.", i));
           }
         });
@@ -138,12 +138,12 @@ class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker {
           PADDLE_ENFORCE_EQ(
               stride.size(),
               2UL,
-              platform::errors::InvalidArgument(
+              phi::errors::InvalidArgument(
                   "Must provide 2 stride for width and height only."));
           for (size_t i = 0; i < stride.size(); ++i) {
             PADDLE_ENFORCE_GT(stride[i],
                               0.0,
-                              platform::errors::InvalidArgument(
+                              phi::errors::InvalidArgument(
                                   "stride[%d] should be larger than 0.", i));
           }
         });
diff --git a/paddle/fluid/operators/detection/bipartite_match_op.cc b/paddle/fluid/operators/detection/bipartite_match_op.cc
index 53c082add0fa5..32942a03f1ab4 100644
--- a/paddle/fluid/operators/detection/bipartite_match_op.cc
+++ b/paddle/fluid/operators/detection/bipartite_match_op.cc
@@ -26,24 +26,24 @@ class BipartiteMatchOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(
         ctx->HasInput("DistMat"),
         true,
-        platform::errors::InvalidArgument(
+        phi::errors::InvalidArgument(
            "Input(DistMat) of BipartiteMatch should not be null."));
     PADDLE_ENFORCE_EQ(ctx->HasOutput("ColToRowMatchIndices"),
                       true,
-                      platform::errors::InvalidArgument(
+                      phi::errors::InvalidArgument(
                           "Output(ColToRowMatchIndices) of BipartiteMatch "
                           "should not be null."));
     PADDLE_ENFORCE_EQ(
         ctx->HasOutput("ColToRowMatchDist"),
         true,
-        platform::errors::InvalidArgument(
+        phi::errors::InvalidArgument(
            "Output(ColToRowMatchDist) of BipartiteMatch should not be null."));
 
     auto dims = ctx->GetInputDim("DistMat");
-    PADDLE_ENFORCE_EQ(dims.size(),
-                      2,
-                      platform::errors::InvalidArgument(
-                          "The rank of Input(DistMat) must be 2."));
+    PADDLE_ENFORCE_EQ(
+        dims.size(),
+        2,
+        phi::errors::InvalidArgument("The rank of Input(DistMat) must be 2."));
 
     ctx->SetOutputDim("ColToRowMatchIndices", dims);
     ctx->SetOutputDim("ColToRowMatchDist", dims);
@@ -75,7 +75,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_EQ(
         dist.dims().size(),
         2,
-        platform::errors::InvalidArgument("The rank of dist must be 2."));
+        phi::errors::InvalidArgument("The rank of dist must be 2."));
     int64_t row = dist.dims()[0];
     int64_t col = dist.dims()[1];
     auto* dist_data = dist.data<T>();
@@ -140,7 +140,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
         PADDLE_ENFORCE_EQ(
             match_indices[max_idx],
             -1,
-            platform::errors::InvalidArgument(
+            phi::errors::InvalidArgument(
                 "The match_indices must be initialized to -1 at [%d].",
                 max_idx));
         match_indices[max_idx] = max_row_idx;
@@ -183,7 +183,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
           PADDLE_ENFORCE_EQ(
               match_indices[j],
               -1,
-              platform::errors::InvalidArgument(
+              phi::errors::InvalidArgument(
                   "The match_indices must be initialized to -1 at [%d].", j));
           match_indices[j] = max_row_idx;
           match_dist[j] = max_dist;
@@ -208,7 +208,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
       PADDLE_ENFORCE_EQ(
           dist_mat->lod().size(),
           1UL,
-          platform::errors::InvalidArgument("Only support 1 level of LoD."));
+          phi::errors::InvalidArgument("Only support 1 level of LoD."));
     }
     match_indices->mutable_data<int>({n, col}, context.GetPlace());
     match_dist->mutable_data<T>({n, col}, context.GetPlace());
diff --git a/paddle/fluid/operators/detection/box_clip_op.cc b/paddle/fluid/operators/detection/box_clip_op.cc
index 5af100b8f6407..8df39b759cabb 100644
--- a/paddle/fluid/operators/detection/box_clip_op.cc
+++ b/paddle/fluid/operators/detection/box_clip_op.cc
@@ -24,12 +24,12 @@ class BoxClipOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE_EQ(ctx->HasInput("Input"),
                       true,
-                      platform::errors::NotFound("Input(Input) of BoxClipOp "
-                                                 "is not found."));
+                      phi::errors::NotFound("Input(Input) of BoxClipOp "
+                                            "is not found."));
     PADDLE_ENFORCE_EQ(ctx->HasInput("ImInfo"),
                       true,
-                      platform::errors::NotFound("Input(ImInfo) of BoxClipOp "
-                                                 "is not found."));
+                      phi::errors::NotFound("Input(ImInfo) of BoxClipOp "
+                                            "is not found."));
 
     auto input_box_dims = ctx->GetInputDim("Input");
     auto im_info_dims = ctx->GetInputDim("ImInfo");
@@ -39,20 +39,20 @@ class BoxClipOp : public framework::OperatorWithKernel {
       PADDLE_ENFORCE_EQ(
           input_box_dims[input_box_size - 1],
           4,
-          platform::errors::InvalidArgument(
+          phi::errors::InvalidArgument(
               "The last dimension of Input(Input) in BoxClipOp must be 4. "
               "But received last dimension = %d",
               input_box_dims[input_box_size - 1]));
       PADDLE_ENFORCE_EQ(im_info_dims.size(),
                         2,
-                        platform::errors::InvalidArgument(
+                        phi::errors::InvalidArgument(
                             "The rank of Input(Input) in BoxClipOp must be 2."
                             " But received rank = %d",
                             im_info_dims.size()));
       PADDLE_ENFORCE_EQ(
           im_info_dims[1],
           3,
-          platform::errors::InvalidArgument(
+          phi::errors::InvalidArgument(
               "The last dimension of Input(ImInfo) of BoxClipOp must be 3. "
               "But received last dimension = %d",
               im_info_dims[1]));
diff --git a/paddle/fluid/operators/detection/box_clip_op.h b/paddle/fluid/operators/detection/box_clip_op.h
index c07185dec167c..18faf1e2fbbcd 100644
--- a/paddle/fluid/operators/detection/box_clip_op.h
+++ b/paddle/fluid/operators/detection/box_clip_op.h
@@ -31,7 +31,7 @@ class BoxClipKernel : public framework::OpKernel<T> {
     if (input_box->lod().size()) {
       PADDLE_ENFORCE_EQ(input_box->lod().size(),
                         1UL,
-                        platform::errors::InvalidArgument(
+                        phi::errors::InvalidArgument(
                             "Input(Input) of BoxClip only supports 1 level "
                             "of LoD. But received the "
                             "level = %d",
diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc
index 552a6da3b3425..a7b9ad490b56c 100644
--- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc
+++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc
@@ -23,33 +23,33 @@ class BoxDecoderAndAssignOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(
         ctx->HasInput("PriorBox"),
         true,
-        platform::errors::NotFound("Input(PriorBox) of BoxDecoderAndAssignOp "
-                                   "is not found."));
+        phi::errors::NotFound("Input(PriorBox) of BoxDecoderAndAssignOp "
+                              "is not found."));
     PADDLE_ENFORCE_EQ(
         ctx->HasInput("PriorBoxVar"),
         true,
-        platform::errors::NotFound("Input(PriorBoxVar) of BoxDecoderAndAssignOp"
-                                   " is not found."));
+        phi::errors::NotFound("Input(PriorBoxVar) of BoxDecoderAndAssignOp"
+                              " is not found."));
     PADDLE_ENFORCE_EQ(
         ctx->HasInput("TargetBox"),
         true,
-        platform::errors::NotFound("Input(TargetBox) of BoxDecoderAndAssignOp "
-                                   "is not found."));
+        phi::errors::NotFound("Input(TargetBox) of BoxDecoderAndAssignOp "
+                              "is not found."));
     PADDLE_ENFORCE_EQ(
         ctx->HasInput("BoxScore"),
         true,
-        platform::errors::NotFound("Input(BoxScore) of BoxDecoderAndAssignOp "
-                                   "is not found."));
+        phi::errors::NotFound("Input(BoxScore) of BoxDecoderAndAssignOp "
+                              "is not found."));
     PADDLE_ENFORCE_EQ(
         ctx->HasOutput("DecodeBox"),
         true,
-        platform::errors::NotFound("Output(DecodeBox) of BoxDecoderAndAssignOp"
-                                   " is not found."));
+        phi::errors::NotFound("Output(DecodeBox) of BoxDecoderAndAssignOp"
+                              " is not found."));
     PADDLE_ENFORCE_EQ(
         ctx->HasOutput("OutputAssignBox"),
         true,
-        platform::errors::NotFound("Output(OutputAssignBox) of "
-                                   "BoxDecoderAndAssignOp is not found."));
+        phi::errors::NotFound("Output(OutputAssignBox) of "
+                              "BoxDecoderAndAssignOp is not found."));
 
     auto prior_box_dims = ctx->GetInputDim("PriorBox");
     auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
@@ -59,45 +59,45 @@ class BoxDecoderAndAssignOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(
         prior_box_dims.size(),
         2,
-        platform::errors::InvalidArgument("The rank of Input of PriorBox must"
-                                          " be 2. But received rank = %d",
-                                          prior_box_dims.size()));
+        phi::errors::InvalidArgument("The rank of Input of PriorBox must"
+                                     " be 2. But received rank = %d",
+                                     prior_box_dims.size()));
     PADDLE_ENFORCE_EQ(
         prior_box_dims[1],
         4,
-        platform::errors::InvalidArgument(
+        phi::errors::InvalidArgument(
             "The shape of PriorBox is [N, 4], "
             "and the second dimension must be 4. But received dimension = %d",
             prior_box_dims[1]));
     PADDLE_ENFORCE_EQ(
         prior_box_var_dims.size(),
         1,
-        platform::errors::InvalidArgument("The rank of Input of PriorBoxVar "
-                                          "must be 1. But received rank = %d",
-                                          prior_box_var_dims.size()));
+        phi::errors::InvalidArgument("The rank of Input of PriorBoxVar "
+                                     "must be 1. But received rank = %d",
+                                     prior_box_var_dims.size()));
     PADDLE_ENFORCE_EQ(
         prior_box_var_dims[0],
         4,
-        platform::errors::InvalidArgument("The shape of PriorBoxVar is [4]. "
-                                          "But received dimension = %d",
-                                          prior_box_var_dims[0]));
+        phi::errors::InvalidArgument("The shape of PriorBoxVar is [4]. "
+                                     "But received dimension = %d",
+                                     prior_box_var_dims[0]));
     PADDLE_ENFORCE_EQ(
         target_box_dims.size(),
         2,
-        platform::errors::InvalidArgument("The rank of Input of TargetBox must "
-                                          "be 2. But received rank = %d",
-                                          target_box_dims.size()));
+        phi::errors::InvalidArgument("The rank of Input of TargetBox must "
+                                     "be 2. But received rank = %d",
+                                     target_box_dims.size()));
     PADDLE_ENFORCE_EQ(
         box_score_dims.size(),
         2,
-        platform::errors::InvalidArgument("The rank of Input of BoxScore must "
-                                          "be 2. But received rank = %d",
-                                          box_score_dims.size()));
+        phi::errors::InvalidArgument("The rank of Input of BoxScore must "
+                                     "be 2. But received rank = %d",
+                                     box_score_dims.size()));
     if (ctx->IsRuntime()) {
       PADDLE_ENFORCE_EQ(
           prior_box_dims[0],
          target_box_dims[0],
-          platform::errors::InvalidArgument(
+          phi::errors::InvalidArgument(
              "The first dimension of prior_box and "
              "target_box is the number of box and should be same. But "
              "received dimension of prior_box is %d, dimension of target_box "
@@ -107,7 +107,7 @@ class BoxDecoderAndAssignOp : public framework::OperatorWithKernel {
       PADDLE_ENFORCE_EQ(
           prior_box_dims[0],
           box_score_dims[0],
-          platform::errors::InvalidArgument(
+          phi::errors::InvalidArgument(
              "The first dimension of prior_box and "
              "box_score is the number of box and should be same. But received "
              "dimension of prior_box is %d, dimension of box_score is %d",
@@ -116,7 +116,7 @@ class BoxDecoderAndAssignOp : public framework::OperatorWithKernel {
       PADDLE_ENFORCE_EQ(
           target_box_dims[1],
           box_score_dims[1] * prior_box_dims[1],
-          platform::errors::InvalidArgument(
+          phi::errors::InvalidArgument(
              "The shape of target_box is "
              "[N, classnum * 4], The shape of box_score is [N, classnum], "
              "The shape of prior_box is [N, 4]. But received second dimension "
diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc
index db2f9726db56a..fd5161932ff22 100644
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc
+++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc
@@ -24,18 +24,18 @@ class CollectFpnProposalsOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(
         context->HasInputs("MultiLevelRois"),
         true,
-        platform::errors::NotFound("Inputs(MultiLevelRois) of "
-                                   "CollectFpnProposalsOp is not found"));
+        phi::errors::NotFound("Inputs(MultiLevelRois) of "
+                              "CollectFpnProposalsOp is not found"));
     PADDLE_ENFORCE_EQ(
         context->HasInputs("MultiLevelScores"),
         true,
-        platform::errors::NotFound("Inputs(MultiLevelScores) of "
-                                   "CollectFpnProposalsOp is not found"));
+        phi::errors::NotFound("Inputs(MultiLevelScores) of "
+                              "CollectFpnProposalsOp is not found"));
     PADDLE_ENFORCE_EQ(
         context->HasOutput("FpnRois"),
         true,
-        platform::errors::NotFound("Outputs(MultiFpnRois) of "
-                                   "CollectFpnProposalsOp is not found"));
+        phi::errors::NotFound("Outputs(MultiFpnRois) of "
+                              "CollectFpnProposalsOp is not found"));
     auto roi_dims = context->GetInputsDim("MultiLevelRois");
     auto score_dims = context->GetInputsDim("MultiLevelScores");
     auto post_nms_topN = context->Attrs().Get<int>("post_nms_topN");
@@ -44,7 +44,7 @@ class CollectFpnProposalsOp : public framework::OperatorWithKernel {
       PADDLE_ENFORCE_EQ(
           roi_dim[1],
           4,
-          platform::errors::InvalidArgument(
+          phi::errors::InvalidArgument(
              "Second dimension of Input"
              "(MultiLevelRois) must be 4. But received dimension = %d",
              roi_dim[1]));
@@ -53,7 +53,7 @@ class CollectFpnProposalsOp : public framework::OperatorWithKernel {
       PADDLE_ENFORCE_EQ(
           score_dim[1],
           1,
-          platform::errors::InvalidArgument(
+          phi::errors::InvalidArgument(
              "Second dimension of Input"
              "(MultiLevelScores) must be 1.
But received dimension = %d", score_dim[1])); @@ -79,7 +79,7 @@ class CollectFpnProposalsOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( roi_lod, score_lod, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Inputs(MultiLevelRois) and " "Inputs(MultiLevelScores) should have same lod.")); } diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h index 462b4a4584ece..81356170598bf 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h @@ -76,7 +76,7 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel { PADDLE_ENFORCE_GE(post_nms_topN, 0UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The parameter post_nms_topN must be " "a positive integer. But received post_nms_topN = %d", post_nms_topN)); @@ -85,7 +85,7 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( multi_layer_rois.size(), multi_layer_scores.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of RoIs and Scores should" " be the same. But received number of RoIs is %d, number of Scores " "is %d", diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cc b/paddle/fluid/operators/detection/density_prior_box_op.cc index e79de60b7690d..4a533615aab15 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.cc +++ b/paddle/fluid/operators/detection/density_prior_box_op.cc @@ -29,7 +29,7 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( image_dims.size(), 4, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(Image) of Op(density_prior_box) should be a 4-D Tensor " "and data format is NCHW. But received Image's dimensions = %d, " "shape = [%s].", @@ -38,7 +38,7 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( input_dims.size(), 4, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(Input) of Op(density_prior_box) should be a 4-D Tensor " "and data format is NCHW. But received Input's dimensions = %d, " "shape = [%s].", @@ -49,7 +49,7 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_LT( input_dims[2], image_dims[2], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input tensor Input's height" "of DensityPriorBoxOp should be smaller than input tensor Image's" "height. But received Input's height = %d, Image's height = %d", @@ -59,7 +59,7 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_LT( input_dims[3], image_dims[3], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input tensor Input's width" "of DensityPriorBoxOp should be smaller than input tensor Image's" "width. But received Input's width = %d, Image's width = %d", @@ -76,7 +76,7 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( fixed_sizes.size(), densities.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The length of fixed_sizes and densities must be equal. 
" "But received: fixed_sizes's length is %d, densities's length " "is %d", @@ -139,14 +139,14 @@ class DensityPriorBoxOpMaker : public framework::OpProtoAndCheckerMaker { .AddCustomChecker([](const std::vector& variances) { PADDLE_ENFORCE_EQ(variances.size(), 4, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The length of variance must " "be 4. But received: variances' length is %d.", variances.size())); for (size_t i = 0; i < variances.size(); ++i) { PADDLE_ENFORCE_GT(variances[i], 0.0, - platform::errors::OutOfRange( + phi::errors::OutOfRange( "variance[%d] must be greater " "than 0. But received: variance[%d] = %f", i, @@ -165,24 +165,24 @@ class DensityPriorBoxOpMaker : public framework::OpProtoAndCheckerMaker { "Density prior boxes step across width, 0.0 for auto calculation.") .SetDefault(0.0) .AddCustomChecker([](const float& step_w) { - PADDLE_ENFORCE_GE(step_w, - 0.0, - platform::errors::InvalidArgument( - "step_w should be larger " - "than 0. But received: step_w = %f.", - step_w)); + PADDLE_ENFORCE_GE( + step_w, + 0.0, + phi::errors::InvalidArgument("step_w should be larger " + "than 0. But received: step_w = %f.", + step_w)); }); AddAttr( "step_h", "Density prior boxes step across height, 0.0 for auto calculation.") .SetDefault(0.0) .AddCustomChecker([](const float& step_h) { - PADDLE_ENFORCE_GE(step_h, - 0.0, - platform::errors::InvalidArgument( - "step_h should be larger " - "than 0. But received: step_h = %f.", - step_h)); + PADDLE_ENFORCE_GE( + step_h, + 0.0, + phi::errors::InvalidArgument("step_h should be larger " + "than 0. But received: step_h = %f.", + step_h)); }); AddAttr("offset", @@ -198,7 +198,7 @@ class DensityPriorBoxOpMaker : public framework::OpProtoAndCheckerMaker { PADDLE_ENFORCE_GT( fixed_sizes[i], 0.0, - platform::errors::OutOfRange( + phi::errors::OutOfRange( "fixed_sizes[%d] should be " "larger than 0. But received: fixed_sizes[%d] = %f", i, @@ -216,7 +216,7 @@ class DensityPriorBoxOpMaker : public framework::OpProtoAndCheckerMaker { PADDLE_ENFORCE_GT( fixed_ratios[i], 0.0, - platform::errors::OutOfRange( + phi::errors::OutOfRange( "fixed_ratios[%d] should be " "larger than 0. But received: fixed_ratios[%d] = %f", i, @@ -234,7 +234,7 @@ class DensityPriorBoxOpMaker : public framework::OpProtoAndCheckerMaker { PADDLE_ENFORCE_GT( densities[i], 0, - platform::errors::OutOfRange( + phi::errors::OutOfRange( "densities[%d] should be " "larger than 0. 
But received: densities[%d] = %f.", i, diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc index bf56a6f857e0d..5ee843d72387b 100644 --- a/paddle/fluid/operators/detection/generate_mask_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc @@ -44,57 +44,57 @@ class GenerateMaskLabelsOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->HasInput("ImInfo"), true, - platform::errors::InvalidArgument("Input(ImInfo) shouldn't be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("GtClasses"), - true, - platform::errors::InvalidArgument( - "Input(GtClasses) shouldn't be null.")); + phi::errors::InvalidArgument("Input(ImInfo) shouldn't be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("GtClasses"), + true, + phi::errors::InvalidArgument("Input(GtClasses) shouldn't be null.")); PADDLE_ENFORCE_EQ( ctx->HasInput("IsCrowd"), true, - platform::errors::InvalidArgument("Input(IsCrowd) shouldn't be null.")); + phi::errors::InvalidArgument("Input(IsCrowd) shouldn't be null.")); PADDLE_ENFORCE_EQ( ctx->HasInput("GtSegms"), true, - platform::errors::InvalidArgument("Input(GtSegms) shouldn't be null.")); + phi::errors::InvalidArgument("Input(GtSegms) shouldn't be null.")); PADDLE_ENFORCE_EQ( ctx->HasInput("Rois"), true, - platform::errors::InvalidArgument("Input(Rois) shouldn't be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("LabelsInt32"), - true, - platform::errors::InvalidArgument( - "Input(LabelsInt32) shouldn't be null.")); + phi::errors::InvalidArgument("Input(Rois) shouldn't be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("LabelsInt32"), + true, + phi::errors::InvalidArgument("Input(LabelsInt32) shouldn't be null.")); PADDLE_ENFORCE_EQ( ctx->HasOutput("MaskRois"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(MaskRois) of GenerateMaskLabelsOp should not be null")); PADDLE_ENFORCE_EQ(ctx->HasOutput("RoiHasMaskInt32"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(RoiHasMaskInt32) of GenerateMaskLabelsOp " "should not be null")); PADDLE_ENFORCE_EQ( ctx->HasOutput("MaskInt32"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(MaskInt32) of GenerateMaskLabelsOp should not be null")); auto im_info_dims = ctx->GetInputDim("ImInfo"); auto gt_segms_dims = ctx->GetInputDim("GtSegms"); - PADDLE_ENFORCE_EQ(im_info_dims.size(), - 2, - platform::errors::InvalidArgument( - "The rank of Input(ImInfo) must be 2.")); - PADDLE_ENFORCE_EQ(gt_segms_dims.size(), - 2, - platform::errors::InvalidArgument( - "The rank of Input(GtSegms) must be 2.")); + PADDLE_ENFORCE_EQ( + im_info_dims.size(), + 2, + phi::errors::InvalidArgument("The rank of Input(ImInfo) must be 2.")); + PADDLE_ENFORCE_EQ( + gt_segms_dims.size(), + 2, + phi::errors::InvalidArgument("The rank of Input(GtSegms) must be 2.")); PADDLE_ENFORCE_EQ(gt_segms_dims[1], 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dim of Input(GtSegms) must be 2.")); int num_classes = ctx->Attrs().Get("num_classes"); int resolution = ctx->Attrs().Get("resolution"); @@ -170,7 +170,7 @@ std::vector SampleMaskForOneImage( const int* label_int32_data = label_int32.data(); PADDLE_ENFORCE_EQ(roi_size, label_int32.dims()[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dim of label [%d] is the different from " "roi_size [%d], they should be same.", label_int32.dims()[0], @@ -197,7 +197,7 @@ 
std::vector SampleMaskForOneImage( int e = static_cast(lod2[s_idx + j + 1]); PADDLE_ENFORCE_NE(s, e, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The start point and the end point in the poly " "segment [%d] should not be same, but received " "the start point [%d] and the end point [%d].", @@ -349,34 +349,34 @@ class GenerateMaskLabelsKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( gt_classes->lod().size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "GenerateMaskLabelsOp gt_classes needs 1 level of LoD")); PADDLE_ENFORCE_EQ( is_crowd->lod().size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "GenerateMaskLabelsOp is_crowd needs 1 level of LoD")); PADDLE_ENFORCE_EQ(rois->lod().size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "GenerateMaskLabelsOp rois needs 1 level of LoD")); PADDLE_ENFORCE_EQ( label_int32->lod().size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "GenerateMaskLabelsOp label_int32 needs 1 level of LoD")); PADDLE_ENFORCE_EQ( gt_segms->lod().size(), 3UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "GenerateMaskLabelsOp gt_segms needs 3 level of LoD")); int64_t n = static_cast(gt_classes->lod().back().size() - 1); PADDLE_ENFORCE_EQ( gt_segms->lod()[0].size() - 1, n, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Batchsize of Input(gt_segms) and Input(gt_classes) should be " "same, but received gt_segms[%d], gt_classes[%d].", gt_segms->lod()[0].size() - 1, diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index a0fb3ec799eea..ad37aa2ae682f 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -68,49 +68,49 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->HasInput("RpnRois"), true, - platform::errors::NotFound("Input(RpnRois) shouldn't be null.")); + phi::errors::NotFound("Input(RpnRois) shouldn't be null.")); PADDLE_ENFORCE_EQ( ctx->HasInput("GtClasses"), true, - platform::errors::NotFound("Input(GtClasses) shouldn't be null.")); + phi::errors::NotFound("Input(GtClasses) shouldn't be null.")); PADDLE_ENFORCE_EQ( ctx->HasInput("IsCrowd"), true, - platform::errors::NotFound("Input(IsCrowd) shouldn't be null.")); + phi::errors::NotFound("Input(IsCrowd) shouldn't be null.")); PADDLE_ENFORCE_EQ( ctx->HasInput("GtBoxes"), true, - platform::errors::NotFound("Input(GtBoxes) shouldn't be null.")); + phi::errors::NotFound("Input(GtBoxes) shouldn't be null.")); PADDLE_ENFORCE_EQ( ctx->HasInput("ImInfo"), true, - platform::errors::NotFound("Input(ImInfo) shouldn't be null.")); + phi::errors::NotFound("Input(ImInfo) shouldn't be null.")); PADDLE_ENFORCE_EQ( ctx->HasOutput("Rois"), true, - platform::errors::NotFound( + phi::errors::NotFound( "Output(Rois) of GenerateProposalLabelsOp should not be null")); PADDLE_ENFORCE_EQ(ctx->HasOutput("LabelsInt32"), true, - platform::errors::NotFound("Output(LabelsInt32) of " - "GenerateProposalLabelsOp " - "should not be null")); + phi::errors::NotFound("Output(LabelsInt32) of " + "GenerateProposalLabelsOp " + "should not be null")); PADDLE_ENFORCE_EQ(ctx->HasOutput("BboxTargets"), true, - platform::errors::NotFound("Output(BboxTargets) of " - "GenerateProposalLabelsOp " - "should not be null")); + 
phi::errors::NotFound("Output(BboxTargets) of " + "GenerateProposalLabelsOp " + "should not be null")); PADDLE_ENFORCE_EQ( ctx->HasOutput("BboxInsideWeights"), true, - platform::errors::NotFound( + phi::errors::NotFound( "Output(BboxInsideWeights) of GenerateProposalLabelsOp " "should not be null")); PADDLE_ENFORCE_EQ( ctx->HasOutput("BboxOutsideWeights"), true, - platform::errors::NotFound( + phi::errors::NotFound( "Output(BboxOutsideWeights) of GenerateProposalLabelsOp " "should not be null")); @@ -120,21 +120,21 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(rpn_rois_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimensions size of Input(RpnRois) must be 2. " "But received dimensions size=[%d], dimensions=[%s].", rpn_rois_dims.size(), rpn_rois_dims)); PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimensions size of Input(GtBoxes) must be 2. " "But received dimensions size=[%d], dimensions=[%s].", gt_boxes_dims.size(), gt_boxes_dims)); PADDLE_ENFORCE_EQ(im_info_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimensions size of Input(ImInfo) must be 2. But " "received dimensions size=[%d], dimensions=[%s].", im_info_dims.size(), @@ -146,7 +146,7 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->HasInput("MaxOverlap"), true, - platform::errors::NotFound( + phi::errors::NotFound( "Input(MaxOverlap) of GenerateProposalLabelsOp " "should not be null when is_cascade_rcnn is True.")); } @@ -544,7 +544,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( rpn_rois->lod().size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "GenerateProposalLabelsOp rpn_rois needs 1 level of LoD. But " "received level of LoD is [%d], LoD is [%s].", rpn_rois->lod().size(), @@ -552,7 +552,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( gt_classes->lod().size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "GenerateProposalLabelsOp gt_classes needs 1 level of LoD. But " "received level of LoD is [%d], LoD is [%s].", gt_classes->lod().size(), @@ -560,7 +560,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( is_crowd->lod().size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "GenerateProposalLabelsOp is_crowd needs 1 level of LoD. But " "received level of LoD is [%d], LoD is [%s].", is_crowd->lod().size(), @@ -568,7 +568,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( gt_boxes->lod().size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "GenerateProposalLabelsOp gt_boxes needs 1 level of LoD. 
But " "received level of LoD is [%d], LoD is [%s].", gt_boxes->lod().size(), diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index 710db1668e237..5e961674cd774 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -35,23 +35,23 @@ class GenerateProposalsOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->HasInput("Scores"), true, - platform::errors::NotFound("Input(Scores) shouldn't be null.")); + phi::errors::NotFound("Input(Scores) shouldn't be null.")); PADDLE_ENFORCE_EQ( ctx->HasInput("BboxDeltas"), true, - platform::errors::NotFound("Input(BboxDeltas) shouldn't be null.")); + phi::errors::NotFound("Input(BboxDeltas) shouldn't be null.")); PADDLE_ENFORCE_EQ( ctx->HasInput("ImInfo"), true, - platform::errors::NotFound("Input(ImInfo) shouldn't be null.")); + phi::errors::NotFound("Input(ImInfo) shouldn't be null.")); PADDLE_ENFORCE_EQ( ctx->HasInput("Anchors"), true, - platform::errors::NotFound("Input(Anchors) shouldn't be null.")); + phi::errors::NotFound("Input(Anchors) shouldn't be null.")); PADDLE_ENFORCE_EQ( ctx->HasInput("Variances"), true, - platform::errors::NotFound("Input(Variances) shouldn't be null.")); + phi::errors::NotFound("Input(Variances) shouldn't be null.")); ctx->SetOutputDim("RpnRois", {-1, 4}); ctx->SetOutputDim("RpnRoiProbs", {-1, 1}); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index d24cbcb81d019..1bb494f7fa508 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -151,7 +151,7 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { float eta = context.Attr("eta"); PADDLE_ENFORCE_GE(eta, 1., - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Not support adaptive NMS. The attribute 'eta' " "should not less than 1. But received eta=[%d]", eta)); diff --git a/paddle/fluid/operators/detection/iou_similarity_op.cc b/paddle/fluid/operators/detection/iou_similarity_op.cc deleted file mode 100644 index 0f2ac1c86d628..0000000000000 --- a/paddle/fluid/operators/detection/iou_similarity_op.cc +++ /dev/null @@ -1,118 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/detection/iou_similarity_op.h" - -namespace paddle { -namespace operators { - -class IOUSimilarityOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "iou_similarity"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "iou_similarity"); - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - - PADDLE_ENFORCE_EQ( - x_dims.size(), - 2UL, - platform::errors::InvalidArgument( - "The rank of Input(X) must be 2, but got dimension = %d.", - x_dims.size())); - PADDLE_ENFORCE_EQ( - x_dims[1], - 4UL, - platform::errors::InvalidArgument( - "The shape of X is [N, 4], bug got dimension = %d.", x_dims[1])); - PADDLE_ENFORCE_EQ( - y_dims.size(), - 2UL, - platform::errors::InvalidArgument( - "The rank of Input(Y) must be 2, but got dimension = %d.", - y_dims.size())); - PADDLE_ENFORCE_EQ( - y_dims[1], - 4UL, - platform::errors::InvalidArgument( - "The shape of Y is [M, 4], but got dimension = %d.", y_dims[1])); - - ctx->ShareLoD("X", /*->*/ "Out"); - ctx->SetOutputDim("Out", common::make_ddim({x_dims[0], y_dims[0]})); - } -}; - -class IOUSimilarityOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput( - "X", - "(phi::DenseTensor, default phi::DenseTensor) " - "Box list X is a 2-D phi::DenseTensor with shape [N, 4] holds N boxes, " - "each box is represented as [xmin, ymin, xmax, ymax], " - "the shape of X is [N, 4]. [xmin, ymin] is the left top " - "coordinate of the box if the input is image feature map, they " - "are close to the origin of the coordinate system. " - "[xmax, ymax] is the right bottom coordinate of the box. " - "This tensor can contain LoD information to represent a batch " - "of inputs. One instance of this batch can contain different " - "numbers of entities."); - AddInput("Y", - "(Tensor, default Tensor) " - "Box list Y holds M boxes, each box is represented as " - "[xmin, ymin, xmax, ymax], the shape of X is [N, 4]. " - "[xmin, ymin] is the left top coordinate of the box if the " - "input is image feature map, and [xmax, ymax] is the right " - "bottom coordinate of the box."); - AddAttr("box_normalized", - "(bool, default true) " - "whether treat the priorbox as a normalized box") - .SetDefault(true); - AddOutput("Out", - "(phi::DenseTensor, the lod is same as input X) The output of " - "iou_similarity op, a tensor with shape [N, M] " - "representing pairwise iou scores."); - - AddComment(R"DOC( -**IOU Similarity Operator** - -Computes intersection-over-union (IOU) between two box lists. -Box list 'X' should be a phi::DenseTensor and 'Y' is a common Tensor, -boxes in 'Y' are shared by all instance of the batched inputs of X. 
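For reference while reading this deletion, a minimal dependency-free C++ sketch of the pairwise IoU the operator computed; Box, Area, IoU, and PairwiseIoU are illustrative names rather than Paddle APIs, while the +1 pixel offset for unnormalized boxes and the 1e-10 epsilon mirror the kernel removed below.

#include <algorithm>
#include <vector>

// One axis-aligned box, [xmin, ymin, xmax, ymax], in the layout the
// deleted OpMaker documents. "normalized" follows the box_normalized
// attribute: pixel-coordinate boxes add +1 to widths and heights.
struct Box { float xmin, ymin, xmax, ymax; };

float Area(const Box& b, bool normalized) {
  float off = normalized ? 0.f : 1.f;
  return (b.xmax - b.xmin + off) * (b.ymax - b.ymin + off);
}

// IoU = inter / (area(A) + area(B) - inter), with the kernel's epsilon
// in the denominator to avoid division by zero.
float IoU(const Box& a, const Box& b, bool normalized) {
  float off = normalized ? 0.f : 1.f;
  float iw = std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin) + off;
  float ih = std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin) + off;
  if (iw <= 0.f || ih <= 0.f) return 0.f;
  float inter = iw * ih;
  return inter / (Area(a, normalized) + Area(b, normalized) - inter + 1e-10f);
}

// Pairwise scores: out[i][j] = IoU(x[i], y[j]), the [N, M] output above.
std::vector<std::vector<float>> PairwiseIoU(const std::vector<Box>& x,
                                            const std::vector<Box>& y,
                                            bool normalized) {
  std::vector<std::vector<float>> out(x.size(),
                                      std::vector<float>(y.size(), 0.f));
  for (size_t i = 0; i < x.size(); ++i)
    for (size_t j = 0; j < y.size(); ++j)
      out[i][j] = IoU(x[i], y[j], normalized);
  return out;
}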
-Given two boxes A and B, the calculation of IOU is as follows: - -$$ -IOU(A, B) = -\\frac{area(A\\cap B)}{area(A)+area(B)-area(A\\cap B)} -$$ - -)DOC"); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - iou_similarity, - ops::IOUSimilarityOp, - ops::IOUSimilarityOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -PD_REGISTER_STRUCT_KERNEL( - iou_similarity, CPU, ALL_LAYOUT, ops::IOUSimilarityKernel, float, double) {} diff --git a/paddle/fluid/operators/detection/iou_similarity_op.cu b/paddle/fluid/operators/detection/iou_similarity_op.cu deleted file mode 100644 index e4e001e096558..0000000000000 --- a/paddle/fluid/operators/detection/iou_similarity_op.cu +++ /dev/null @@ -1,19 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/detection/iou_similarity_op.h" - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL( - iou_similarity, GPU, ALL_LAYOUT, ops::IOUSimilarityKernel, float, double) {} diff --git a/paddle/fluid/operators/detection/iou_similarity_op.h b/paddle/fluid/operators/detection/iou_similarity_op.h deleted file mode 100644 index 75e7b9096241a..0000000000000 --- a/paddle/fluid/operators/detection/iou_similarity_op.h +++ /dev/null @@ -1,134 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" - -template -inline HOSTDEVICE T IOUSimilarity(T xmin1, - T ymin1, - T xmax1, - T ymax1, - T xmin2, - T ymin2, - T xmax2, - T ymax2, - bool normalized, - T eps) { - constexpr T zero = static_cast(0); - T area1; - T area2; - if (!normalized) { - area1 = (ymax1 - ymin1 + 1) * (xmax1 - xmin1 + 1); - area2 = (ymax2 - ymin2 + 1) * (xmax2 - xmin2 + 1); - } else { - area1 = (ymax1 - ymin1) * (xmax1 - xmin1); - area2 = (ymax2 - ymin2) * (xmax2 - xmin2); - } - - T inter_xmax = xmax1 > xmax2 ? xmax2 : xmax1; - T inter_ymax = ymax1 > ymax2 ? ymax2 : ymax1; - T inter_xmin = xmin1 > xmin2 ? xmin1 : xmin2; - T inter_ymin = ymin1 > ymin2 ? ymin1 : ymin2; - T inter_height = inter_ymax - inter_ymin; - T inter_width = inter_xmax - inter_xmin; - if (!normalized) { - inter_height = inter_height + 1; - inter_width = inter_width + 1; - } - inter_height = inter_height > zero ? inter_height : zero; - inter_width = inter_width > zero ? 
inter_width : zero; - T inter_area = inter_width * inter_height; - T union_area = area1 + area2 - inter_area + eps; - T sim_score = inter_area / union_area; - return sim_score; -} - -template -struct IOUSimilarityFunctor { - IOUSimilarityFunctor( - const T* x, const T* y, T* z, int cols, bool normalized, T eps) - : x_(x), - y_(y), - z_(z), - cols_(static_cast(cols)), - normalized_(normalized), - eps_(eps) {} - - inline HOSTDEVICE void operator()(size_t tid) const { - size_t row_id = tid / cols_; - size_t col_id = tid % cols_; - - T x_min1 = x_[row_id * 4]; - T y_min1 = x_[row_id * 4 + 1]; - T x_max1 = x_[row_id * 4 + 2]; - T y_max1 = x_[row_id * 4 + 3]; - - T x_min2 = y_[col_id * 4]; - T y_min2 = y_[col_id * 4 + 1]; - T x_max2 = y_[col_id * 4 + 2]; - T y_max2 = y_[col_id * 4 + 3]; - - T sim = IOUSimilarity(x_min1, - y_min1, - x_max1, - y_max1, - x_min2, - y_min2, - x_max2, - y_max2, - normalized_, - eps_); - - z_[row_id * cols_ + col_id] = sim; - } - const T* x_; - const T* y_; - T* z_; - const size_t cols_; - bool normalized_; - T eps_; -}; - -namespace paddle { -namespace operators { - -template -class IOUSimilarityKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* in_x = ctx.Input("X"); - const phi::DenseTensor* in_y = ctx.Input("Y"); - bool normalized = ctx.Attr("box_normalized"); - phi::DenseTensor* out = ctx.Output("Out"); - - int x_n = in_x->dims()[0]; - int y_n = in_y->dims()[0]; - T eps = static_cast(1e-10); - IOUSimilarityFunctor functor(in_x->data(), - in_y->data(), - out->mutable_data(ctx.GetPlace()), - y_n, - normalized, - eps); - - platform::ForRange for_range( - static_cast(ctx.device_context()), x_n * y_n); - for_range(functor); - } -}; // namespace operators - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/detection/iou_similarity_op_xpu.cc b/paddle/fluid/operators/detection/iou_similarity_op_xpu.cc deleted file mode 100644 index 27ffa64c2a892..0000000000000 --- a/paddle/fluid/operators/detection/iou_similarity_op_xpu.cc +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef PADDLE_WITH_XPU - -#include "paddle/fluid/operators/detection/iou_similarity_op.h" - -namespace paddle { -namespace operators { - -template -class XPUIOUSimilarityKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* in_x = ctx.Input("X"); - const phi::DenseTensor* in_y = ctx.Input("Y"); - bool normalized = ctx.Attr("box_normalized"); - phi::DenseTensor* out = ctx.Output("Out"); - - int x_n = in_x->dims()[0]; - int y_n = in_y->dims()[0]; - T eps = static_cast(1e-10); - - auto& dev_ctx = ctx.template device_context(); - int r = xpu::iou_similarity(dev_ctx.x_context(), - in_x->data(), - in_y->data(), - out->mutable_data(ctx.GetPlace()), - x_n, - y_n, - eps, - normalized); - PADDLE_ENFORCE_EQ( - r, - XPU_SUCCESS, - platform::errors::External( - "XPU iou_similarity kernel return wrong value[%d %s].", - r, - XPUAPIErrorMsg[r])); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -using XPU = paddle::platform::XPUDeviceContext; - -PD_REGISTER_STRUCT_KERNEL( - iou_similarity, XPU, ALL_LAYOUT, ops::XPUIOUSimilarityKernel, float) {} - -#endif diff --git a/paddle/fluid/operators/detection/locality_aware_nms_op.cc b/paddle/fluid/operators/detection/locality_aware_nms_op.cc deleted file mode 100644 index 8d3ed1a033acf..0000000000000 --- a/paddle/fluid/operators/detection/locality_aware_nms_op.cc +++ /dev/null @@ -1,524 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/detection/nms_util.h" - -namespace paddle { -namespace operators { - -class LocalityAwareNMSOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("BBoxes"), "Input", "BBoxes", "locality_aware_nms"); - OP_INOUT_CHECK( - ctx->HasInput("Scores"), "Input", "Scores", "locality_aware_nms"); - OP_INOUT_CHECK( - ctx->HasOutput("Out"), "Output", "Out", "locality_aware_nms"); - - auto box_dims = ctx->GetInputDim("BBoxes"); - auto score_dims = ctx->GetInputDim("Scores"); - auto score_size = score_dims.size(); - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - score_size, - 3, - platform::errors::InvalidArgument( - "The rank of Input(Scores) must be 3. But received %d.", - score_size)); - PADDLE_ENFORCE_EQ( - box_dims.size(), - 3, - platform::errors::InvalidArgument( - "The rank of Input(BBoxes) must be 3. 
But received %d.", - box_dims.size())); - PADDLE_ENFORCE_EQ( - box_dims[2] == 4 || box_dims[2] == 8 || box_dims[2] == 16 || - box_dims[2] == 24 || box_dims[2] == 32, - true, - platform::errors::InvalidArgument( - "The last dimension of Input(BBoxes) must be 4 or 8, " - "represents the layout of coordinate " - "[xmin, ymin, xmax, ymax] or " - "4 points: [x1, y1, x2, y2, x3, y3, x4, y4] or " - "8 points: [xi, yi] i= 1,2,...,8 or " - "12 points: [xi, yi] i= 1,2,...,12 or " - "16 points: [xi, yi] i= 1,2,...,16. " - "But received %d.", - box_dims[2])); - PADDLE_ENFORCE_EQ( - box_dims[1], - score_dims[2], - platform::errors::InvalidArgument( - "The 2nd dimension of Input(BBoxes) must be equal to " - "last dimension of Input(Scores), which represents the " - "predicted bboxes. But received the 2nd dimension of " - "Input(BBoxes) was %d, last dimension of Input(Scores) was %d.", - box_dims[1], - score_dims[2])); - } - // Here the box_dims[0] is not the real dimension of output. - // It will be rewritten in the computing kernel. - ctx->SetOutputDim("Out", {box_dims[1], box_dims[2] + 2}); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey( - OperatorWithKernel::IndicateVarDataType(ctx, "Scores"), - platform::CPUPlace()); - } -}; - -template -void PolyWeightedMerge(const T* box1, - T* box2, - const T score1, - const T score2, - const size_t box_size) { - for (size_t i = 0; i < box_size; ++i) { - box2[i] = (box1[i] * score1 + box2[i] * score2) / (score1 + score2); - } -} - -template -void GetMaxScoreIndexWithLocalityAware( - T* scores, - T* bbox_data, - int64_t box_size, - const T threshold, - int top_k, - int64_t num_boxes, - std::vector>* sorted_indices, - const T nms_threshold, - const bool normalized) { - std::vector skip(num_boxes, true); - int index = -1; - for (int64_t i = 0; i < num_boxes; ++i) { - if (index > -1) { - T overlap = T(0.); - if (box_size == 4) { - overlap = phi::funcs::JaccardOverlap( - bbox_data + i * box_size, bbox_data + index * box_size, normalized); - } - // 8: [x1 y1 x2 y2 x3 y3 x4 y4] or 16, 24, 32 - if (box_size == 8 || box_size == 16 || box_size == 24 || box_size == 32) { - overlap = phi::funcs::PolyIoU(bbox_data + i * box_size, - bbox_data + index * box_size, - box_size, - normalized); - } - - if (overlap > nms_threshold) { - PolyWeightedMerge(bbox_data + i * box_size, - bbox_data + index * box_size, - scores[i], - scores[index], - box_size); - scores[index] += scores[i]; - } else { - skip[index] = false; - index = static_cast(i); - } - } else { - index = static_cast(i); - } - } - - if (index > -1) { - skip[index] = false; - } - for (int64_t i = 0; i < num_boxes; ++i) { - if (scores[i] > threshold && skip[i] == false) { - sorted_indices->push_back(std::make_pair(scores[i], i)); - } - } - // Sort the score pair according to the scores in descending order - std::stable_sort(sorted_indices->begin(), - sorted_indices->end(), - phi::funcs::SortScorePairDescend); - // Keep top_k scores if needed. - if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { - sorted_indices->resize(top_k); - } -} - -template -class LocalityAwareNMSKernel : public framework::OpKernel { - public: - void LocalityAwareNMSFast(phi::DenseTensor* bbox, - phi::DenseTensor* scores, - const T score_threshold, - const T nms_threshold, - const T eta, - const int64_t top_k, - std::vector* selected_indices, - const bool normalized) const { - // The total boxes for each instance. 
- int64_t num_boxes = bbox->dims()[0]; - // 4: [xmin ymin xmax ymax] - // 8: [x1 y1 x2 y2 x3 y3 x4 y4] - // 16, 24, or 32: [x1 y1 x2 y2 ... xn yn], n = 8, 12 or 16 - int64_t box_size = bbox->dims()[1]; - - std::vector> sorted_indices; - T adaptive_threshold = nms_threshold; - T* bbox_data = bbox->data(); - T* scores_data = scores->data(); - - GetMaxScoreIndexWithLocalityAware(scores_data, - bbox_data, - box_size, - score_threshold, - top_k, - num_boxes, - &sorted_indices, - nms_threshold, - normalized); - - selected_indices->clear(); - - while (!sorted_indices.empty()) { - const int idx = sorted_indices.front().second; - bool keep = true; - for (int kept_idx : *selected_indices) { - if (keep) { - T overlap = T(0.); - // 4: [xmin ymin xmax ymax] - if (box_size == 4) { - overlap = - phi::funcs::JaccardOverlap(bbox_data + idx * box_size, - bbox_data + kept_idx * box_size, - normalized); - } - // 8: [x1 y1 x2 y2 x3 y3 x4 y4] or 16, 24, 32 - if (box_size == 8 || box_size == 16 || box_size == 24 || - box_size == 32) { - overlap = phi::funcs::PolyIoU(bbox_data + idx * box_size, - bbox_data + kept_idx * box_size, - box_size, - normalized); - } - keep = overlap <= adaptive_threshold; - } else { - break; - } - } - if (keep) { - selected_indices->push_back(idx); - } - sorted_indices.erase(sorted_indices.begin()); - if (keep && eta < 1 && adaptive_threshold > 0.5) { - adaptive_threshold *= eta; - } - } - // delete bbox_data; - } - - void LocalityAwareNMS(const framework::ExecutionContext& ctx, - phi::DenseTensor* scores, - phi::DenseTensor* bboxes, - const int scores_size, - std::map>* indices, - int* num_nmsed_out) const { - int64_t background_label = ctx.Attr("background_label"); - int64_t nms_top_k = ctx.Attr("nms_top_k"); - int64_t keep_top_k = ctx.Attr("keep_top_k"); - bool normalized = ctx.Attr("normalized"); - T nms_threshold = static_cast(ctx.Attr("nms_threshold")); - T nms_eta = static_cast(ctx.Attr("nms_eta")); - T score_threshold = static_cast(ctx.Attr("score_threshold")); - - int num_det = 0; - - int64_t class_num = scores->dims()[0]; - phi::DenseTensor bbox_slice, score_slice; - for (int64_t c = 0; c < class_num; ++c) { - if (c == background_label) continue; - - score_slice = scores->Slice(c, c + 1); - bbox_slice = *bboxes; - - LocalityAwareNMSFast(&bbox_slice, - &score_slice, - score_threshold, - nms_threshold, - nms_eta, - nms_top_k, - &((*indices)[c]), // NOLINT - normalized); - num_det += (*indices)[c].size(); // NOLINT - } - - *num_nmsed_out = num_det; - const T* scores_data = scores->data(); - if (keep_top_k > -1 && num_det > keep_top_k) { - const T* sdata; - std::vector>> score_index_pairs; - for (const auto& it : *indices) { - int label = it.first; - - sdata = scores_data + label * scores->dims()[1]; - - const std::vector& label_indices = it.second; - for (auto idx : label_indices) { - score_index_pairs.push_back( - std::make_pair(sdata[idx], std::make_pair(label, idx))); - } - } - // Keep top k results per image. - std::stable_sort(score_index_pairs.begin(), - score_index_pairs.end(), - phi::funcs::SortScorePairDescend>); - score_index_pairs.resize(keep_top_k); - - // Store the new indices. 
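// The adaptive-threshold greedy loop of LocalityAwareNMSFast above, isolated
// under simplifying assumptions: candidates arrive sorted by descending
// score (as sorted_indices is) and `overlap` stands in for the
// JaccardOverlap / PolyIoU calls. Types and names are illustrative,
// not Paddle APIs.
#include <utility>
#include <vector>

std::vector<int> GreedyAdaptiveNMS(
    const std::vector<std::pair<float, int>>& sorted,  // (score, box id)
    const std::vector<std::vector<float>>& overlap,    // pairwise IoU
    float nms_threshold, float eta) {
  std::vector<int> kept;
  float adaptive = nms_threshold;
  for (const auto& cand : sorted) {
    bool keep = true;
    for (int k : kept) {
      if (overlap[cand.second][k] > adaptive) {
        keep = false;
        break;
      }
    }
    if (!keep) continue;
    kept.push_back(cand.second);
    // With eta < 1 the threshold tightens after each kept box once it
    // exceeds 0.5, mirroring `adaptive_threshold *= eta` above.
    if (eta < 1.f && adaptive > 0.5f) adaptive *= eta;
  }
  return kept;
}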
- std::map> new_indices; - for (auto& score_index_pair : score_index_pairs) { - int label = score_index_pair.second.first; - int idx = score_index_pair.second.second; - new_indices[label].push_back(idx); - } - - new_indices.swap(*indices); - *num_nmsed_out = keep_top_k; // NOLINT - } - } - - void LocalityAwareNMSOutput( - const platform::DeviceContext& ctx, - const phi::DenseTensor& scores, - const phi::DenseTensor& bboxes, - const std::map>& selected_indices, - const int scores_size, - phi::DenseTensor* outs, - int* oindices = nullptr, - const int offset = 0) const { - int64_t predict_dim = scores.dims()[1]; - int64_t box_size = bboxes.dims()[1]; - if (scores_size == 2) { - box_size = bboxes.dims()[2]; - } - int64_t out_dim = box_size + 2; - auto* scores_data = scores.data(); - auto* bboxes_data = bboxes.data(); - auto* odata = outs->data(); - const T* sdata; - phi::DenseTensor bbox; - bbox.Resize({scores.dims()[0], box_size}); - int count = 0; - for (const auto& it : selected_indices) { - int label = it.first; - const std::vector& indices = it.second; - sdata = scores_data + label * predict_dim; - for (auto idx : indices) { - odata[count * out_dim] = label; // label - const T* bdata; - bdata = bboxes_data + idx * box_size; - odata[count * out_dim + 1] = sdata[idx]; // score - if (oindices != nullptr) { - oindices[count] = offset + idx; - } - - // xmin, ymin, xmax, ymax or multi-points coordinates - std::memcpy(odata + count * out_dim + 2, bdata, box_size * sizeof(T)); - count++; - } - } - } - - void Compute(const framework::ExecutionContext& ctx) const override { - auto* boxes_input = ctx.Input("BBoxes"); - auto* scores_input = ctx.Input("Scores"); - auto* outs = ctx.Output("Out"); - auto& score_dims = scores_input->dims(); - auto score_size = score_dims.size(); - auto& dev_ctx = ctx.template device_context(); - - phi::DenseTensor scores; - phi::DenseTensor boxes; - paddle::framework::TensorCopySync( - *scores_input, platform::CPUPlace(), &scores); - paddle::framework::TensorCopySync( - *boxes_input, platform::CPUPlace(), &boxes); - std::vector>> all_indices; - std::vector batch_starts = {0}; - int64_t batch_size = score_dims[0]; - int64_t box_dim = boxes.dims()[2]; - int64_t out_dim = box_dim + 2; - int num_nmsed_out = 0; - phi::DenseTensor boxes_slice, scores_slice; - int n = static_cast(batch_size); - for (int i = 0; i < n; ++i) { - scores_slice = scores.Slice(i, i + 1); - scores_slice.Resize({score_dims[1], score_dims[2]}); - boxes_slice = boxes.Slice(i, i + 1); - boxes_slice.Resize({score_dims[2], box_dim}); - - std::map> indices; - LocalityAwareNMS(ctx, - &scores_slice, - &boxes_slice, - score_size, - &indices, - &num_nmsed_out); - all_indices.push_back(indices); - batch_starts.push_back(batch_starts.back() + num_nmsed_out); - } - - int num_kept = static_cast(batch_starts.back()); - if (num_kept == 0) { - T* od = outs->mutable_data({1, 1}, ctx.GetPlace()); - od[0] = -1; - batch_starts = {0, 1}; - } else { - outs->mutable_data({num_kept, out_dim}, ctx.GetPlace()); - int offset = 0; - int* oindices = nullptr; - for (int i = 0; i < n; ++i) { - scores_slice = scores.Slice(i, i + 1); - boxes_slice = boxes.Slice(i, i + 1); - scores_slice.Resize({score_dims[1], score_dims[2]}); - boxes_slice.Resize({score_dims[2], box_dim}); - - int64_t s = static_cast(batch_starts[i]); - int64_t e = static_cast(batch_starts[i + 1]); - if (e > s) { - phi::DenseTensor out = outs->Slice(s, e); - LocalityAwareNMSOutput(dev_ctx, - scores_slice, - boxes_slice, - all_indices[i], - score_dims.size(), - &out, - 
oindices, - offset); - } - } - } - - framework::LoD lod; - lod.emplace_back(batch_starts); - outs->set_lod(lod); - } -}; - -class LocalityAwareNMSOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("BBoxes", - "Two types of bboxes are supported:" - "1. (Tensor) A 3-D Tensor with shape " - "[N, M, 4 or 8 16 24 32] represents the " - "predicted locations of M bounding bboxes, N is the batch size. " - "Each bounding box has four coordinate values and the layout is " - "[xmin, ymin, xmax, ymax], when box size equals to 4."); - AddInput("Scores", - "Two types of scores are supported:" - "1. (Tensor) A 3-D Tensor with shape [N, C, M] represents the " - "predicted confidence predictions. N is the batch size, C is the " - "class number, M is number of bounding boxes. For each category " - "there are total M scores which corresponding M bounding boxes. " - " Please note, M is equal to the 2nd dimension of BBoxes. "); - AddAttr( - "background_label", - "(int, default: -1) " - "The index of background label, the background label will be ignored. " - "If set to -1, then all categories will be considered.") - .SetDefault(-1); - AddAttr("score_threshold", - "(float) " - "Threshold to filter out bounding boxes with low " - "confidence score. If not provided, consider all boxes."); - AddAttr("nms_top_k", - "(int64_t) " - "Maximum number of detections to be kept according to the " - "confidences after the filtering detections based on " - "score_threshold"); - AddAttr("nms_threshold", - "(float, default: 0.3) " - "The threshold to be used in NMS.") - .SetDefault(0.3); - AddAttr("nms_eta", - "(float) " - "The parameter for adaptive NMS.") - .SetDefault(1.0); - AddAttr("keep_top_k", - "(int64_t) " - "Number of total bboxes to be kept per image after NMS " - "step. -1 means keeping all bboxes after NMS step."); - AddAttr("normalized", - "(bool, default true) " - "Whether detections are normalized.") - .SetDefault(true); - AddOutput("Out", - "(phi::DenseTensor) A 2-D phi::DenseTensor with shape [No, 6] " - "represents the " - "detections. Each row has 6 values: " - "[label, confidence, xmin, ymin, xmax, ymax] or " - "(phi::DenseTensor) A 2-D phi::DenseTensor with shape [No, 10] " - "represents the " - "detections. Each row has 10 values: " - "[label, confidence, x1, y1, x2, y2, x3, y3, x4, y4]. No is the " - "total number of detections in this mini-batch." - "For each instance, " - "the offsets in first dimension are called LoD, the number of " - "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is " - "no detected bbox."); - AddComment(R"DOC( -This operator is to do locality-aware non maximum suppression (NMS) on a batched -of boxes and scores. -Firstly, this operator merge box and score according their IOU(intersection over union). -In the NMS step, this operator greedily selects a subset of detection bounding -boxes that have high scores larger than score_threshold, if providing this -threshold, then selects the largest nms_top_k confidences scores if nms_top_k -is larger than -1. Then this operator prunes away boxes that have high IOU -(intersection over union) overlap with already selected boxes by adaptive -threshold NMS based on parameters of nms_threshold and nms_eta. -After NMS step, at most keep_top_k number of total bboxes are to be kept -per image if keep_top_k is larger than -1. -This operator support multi-class and batched inputs. It applying NMS -independently for each class. 
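To make the keep_top_k step concrete, a hedged standalone sketch of the cross-class cap that the deleted kernel applies through score_index_pairs; the std::map and std::vector types here are illustrative stand-ins for the operator's tensors.

#include <algorithm>
#include <map>
#include <utility>
#include <vector>

// Given per-class kept detections as (score, box index) pairs, retain only
// the keep_top_k highest-scoring detections across all classes; -1 keeps
// everything, matching the attribute documented above.
std::map<int, std::vector<int>> KeepTopKAcrossClasses(
    const std::map<int, std::vector<std::pair<float, int>>>& per_class,
    int keep_top_k) {
  // Flatten to (score, (label, index)) so classes compete on score alone.
  std::vector<std::pair<float, std::pair<int, int>>> flat;
  for (const auto& kv : per_class)
    for (const auto& si : kv.second)
      flat.push_back({si.first, {kv.first, si.second}});
  // Stable descending sort preserves order on score ties, as the kernel's
  // std::stable_sort with SortScorePairDescend does.
  std::stable_sort(flat.begin(), flat.end(),
                   [](const std::pair<float, std::pair<int, int>>& a,
                      const std::pair<float, std::pair<int, int>>& b) {
                     return a.first > b.first;
                   });
  if (keep_top_k > -1 && static_cast<int>(flat.size()) > keep_top_k)
    flat.resize(keep_top_k);
  std::map<int, std::vector<int>> out;
  for (const auto& e : flat) out[e.second.first].push_back(e.second.second);
  return out;
}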
The outputs is a 2-D LoDTensor, for each -image, the offsets in first dimension of phi::DenseTensor are called LoD, the number -of offset is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0, -means there is no detected bbox for this image. - -Please get more information from the following papers: -https://arxiv.org/abs/1704.03155. -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - locality_aware_nms, - ops::LocalityAwareNMSOp, - ops::LocalityAwareNMSOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -PD_REGISTER_STRUCT_KERNEL(locality_aware_nms, - CPU, - ALL_LAYOUT, - ops::LocalityAwareNMSKernel, - float, - double) {} diff --git a/paddle/fluid/operators/detection/mine_hard_examples_op.cc b/paddle/fluid/operators/detection/mine_hard_examples_op.cc deleted file mode 100644 index 0ce9979ff2a3d..0000000000000 --- a/paddle/fluid/operators/detection/mine_hard_examples_op.cc +++ /dev/null @@ -1,412 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -enum MiningType { kNone = 0, kMaxNegative, kHardExample }; - -template -bool SortScoreDescend(const std::pair& pair1, - const std::pair& pair2) { - return pair1.first > pair2.first; -} - -inline bool IsEligibleMining(const MiningType mining_type, - const int match_idx, - const float match_dist, - const float neg_dist_threshold) { - if (mining_type == MiningType::kMaxNegative) { - return match_idx == -1 && match_dist < neg_dist_threshold; - } else if (mining_type == MiningType::kHardExample) { - return true; - } else { - return false; - } -} - -inline MiningType GetMiningType(std::string str) { - if (str == "max_negative") { - return MiningType::kMaxNegative; - } else if (str == "hard_example") { - return MiningType::kHardExample; - } else { - return MiningType::kNone; - } -} - -template -class MineHardExamplesKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in_cls_loss = ctx.Input("ClsLoss"); - auto* in_loc_loss = ctx.Input("LocLoss"); - auto* in_matched_indices = ctx.Input("MatchIndices"); - auto* in_match_dist = ctx.Input("MatchDist"); - float neg_pos_ratio = ctx.Attr("neg_pos_ratio"); - T neg_dist_threshold = - static_cast(ctx.Attr("neg_dist_threshold")); - int sample_size = ctx.Attr("sample_size"); - MiningType mining_type = - GetMiningType(ctx.Attr("mining_type")); - - auto out_neg_indices = ctx.Output("NegIndices"); - auto out_match_indices = - ctx.Output("UpdatedMatchIndices"); - - framework::TensorCopy( - *in_matched_indices, ctx.GetPlace(), out_match_indices); - - int batch_size = static_cast(in_matched_indices->dims()[0]); - int prior_num = static_cast(in_matched_indices->dims()[1]); - - auto match_indices = 
framework::EigenMatrix::From(*in_matched_indices); - - auto match_indices_et = - framework::EigenMatrix::From(*out_match_indices); - - auto match_dist = framework::EigenMatrix::From(*in_match_dist); - - const T* cls_loss = in_cls_loss->data(); - const T* loc_loss = nullptr; - if (in_loc_loss) { - loc_loss = in_loc_loss->data(); - } - - std::vector> all_neg_indices; - std::vector batch_starts = {0}; - for (int n = 0; n < batch_size; ++n) { - std::vector> loss_idx; - int neg_sel = 0; - for (int m = 0; m < prior_num; ++m) { - if (IsEligibleMining(mining_type, - match_indices(n, m), - match_dist(n, m), - neg_dist_threshold)) { - T loss = cls_loss[n * prior_num + m]; - if (mining_type == MiningType::kHardExample && loc_loss != nullptr) { - loss = cls_loss[n * prior_num + m] + loc_loss[n * prior_num + m]; - } - loss_idx.push_back(std::make_pair(loss, m)); - ++neg_sel; - } - } - - if (mining_type == MiningType::kMaxNegative) { - int num_pos = 0; - for (int m = 0; m < prior_num; ++m) { - if (match_indices(n, m) != -1) ++num_pos; - } - neg_sel = std::min(static_cast(num_pos * neg_pos_ratio), // NOLINT - neg_sel); - } else if (mining_type == MiningType::kHardExample) { - neg_sel = std::min(sample_size, neg_sel); - } - - std::sort(loss_idx.begin(), loss_idx.end(), SortScoreDescend); - std::set sel_indices; - std::vector neg_indices; - std::transform(loss_idx.begin(), - loss_idx.begin() + neg_sel, - std::inserter(sel_indices, sel_indices.begin()), - [](std::pair& l) -> int { - return static_cast(l.second); - }); - - if (mining_type == MiningType::kHardExample) { - for (int m = 0; m < prior_num; ++m) { - if (match_indices(n, m) > -1) { - if (sel_indices.find(m) == sel_indices.end()) { - match_indices_et(n, m) = -1; - } - } else { - if (sel_indices.find(m) != sel_indices.end()) { - neg_indices.push_back(m); - } - } - } - } else { - neg_indices.resize(sel_indices.size()); - std::copy(sel_indices.begin(), sel_indices.end(), neg_indices.begin()); - } - - all_neg_indices.push_back(neg_indices); - batch_starts.push_back(batch_starts.back() + neg_indices.size()); - } - - framework::LoD out_neg_indices_lod; - out_neg_indices_lod.emplace_back(batch_starts); - int neg_offset = 0; - auto neg_data = out_neg_indices->mutable_data( - common::make_ddim({static_cast(batch_starts.back()), 1}), - ctx.GetPlace()); - - for (auto neg_indices : all_neg_indices) { - std::copy(neg_indices.begin(), neg_indices.end(), neg_data + neg_offset); - neg_offset += static_cast(neg_indices.size()); - } - out_neg_indices->set_lod(out_neg_indices_lod); - return; - } -}; - -class MineHardExamplesOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("ClsLoss"), "Input", "ClsLoss", "mine_hard_examples"); - OP_INOUT_CHECK(ctx->HasInput("MatchIndices"), - "Input", - "MatchIndices", - "mine_hard_examples"); - OP_INOUT_CHECK( - ctx->HasInput("MatchDist"), "Input", "MatchDist", "mine_hard_examples"); - OP_INOUT_CHECK(ctx->HasOutput("NegIndices"), - "Output", - "NegIndices", - "mine_hard_examples"); - OP_INOUT_CHECK(ctx->HasOutput("UpdatedMatchIndices"), - "Output", - "UpdatedMatchIndices", - "mine_hard_examples"); - - auto cls_loss_dims = ctx->GetInputDim("ClsLoss"); - auto idx_dims = ctx->GetInputDim("MatchIndices"); - auto dis_dims = ctx->GetInputDim("MatchDist"); - - PADDLE_ENFORCE_EQ(cls_loss_dims.size(), - 2UL, - platform::errors::InvalidArgument( - "The shape of 
ClsLoss is [N, Np]. But received %d.", - cls_loss_dims.size())); - PADDLE_ENFORCE_EQ( - idx_dims.size(), - 2UL, - platform::errors::InvalidArgument( - "The shape of MatchIndices is [N, Np]. But received %d.", - idx_dims.size())); - PADDLE_ENFORCE_EQ(dis_dims.size(), - 2UL, - platform::errors::InvalidArgument( - "The shape of MatchDist is [N, Np]. But received %d.", - dis_dims.size())); - - if (ctx->HasInput("LocLoss")) { - auto loc_loss_dims = ctx->GetInputDim("LocLoss"); - PADDLE_ENFORCE_EQ(loc_loss_dims.size(), - 2UL, - platform::errors::InvalidArgument( - "The shape of LocLoss is [N, Np]. But received %d.", - loc_loss_dims.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(cls_loss_dims[0], - loc_loss_dims[0], - platform::errors::InvalidArgument( - "Batch size of ClsLoss and LocLoss must be the " - "same. But received batch size of ClsLoss was " - "%d, batch size of LocLoss was %d.", - cls_loss_dims[0], - loc_loss_dims[0])); - PADDLE_ENFORCE_EQ(cls_loss_dims[1], - loc_loss_dims[1], - platform::errors::InvalidArgument( - "Prior box number of ClsLoss and LocLoss must be " - "the same. But received box number of ClsLoss " - "was %d, box number of LocLoss was %d.", - cls_loss_dims[1], - loc_loss_dims[1])); - } - } - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(cls_loss_dims[0], - idx_dims[0], - platform::errors::InvalidArgument( - "Batch size of ClsLoss and MatchIndices must be " - "the same. But received batch size of ClsLoss was " - "%d, batch size of MatchIndices was %d.", - cls_loss_dims[0], - idx_dims[0])); - PADDLE_ENFORCE_EQ( - cls_loss_dims[1], - idx_dims[1], - platform::errors::InvalidArgument( - "Prior box number of ClsLoss and " - "MatchIndices must be the same. But received box number of " - "ClsLoss was %d, box number of MatchIndices was %d.", - cls_loss_dims[1], - idx_dims[1])); - - PADDLE_ENFORCE_EQ(cls_loss_dims[0], - dis_dims[0], - platform::errors::InvalidArgument( - "Batch size of ClsLoss and MatchDist must be the " - "same. But received batch size of ClsLoss was %d, " - "batch size of MatchDist was %d.", - cls_loss_dims[0], - dis_dims[0])); - PADDLE_ENFORCE_EQ(cls_loss_dims[1], - dis_dims[1], - platform::errors::InvalidArgument( - "Prior box number of ClsLoss and MatchDist must be " - "the same. But received box number of ClsLoss was " - "%d, box number of MatchDist was %d.", - cls_loss_dims[1], - dis_dims[1])); - } - - auto mining_type = - GetMiningType(ctx->Attrs().Get("mining_type")); - - PADDLE_ENFORCE_NE(mining_type, - MiningType::kNone, - platform::errors::InvalidArgument( - "mining_type must be hard_example or max_negative")); - - if (mining_type == MiningType::kMaxNegative) { - auto neg_pos_ratio = ctx->Attrs().Get("neg_pos_ratio"); - auto neg_dist_threshold = ctx->Attrs().Get("neg_dist_threshold"); - PADDLE_ENFORCE_GT(neg_pos_ratio, - 0.0f, - platform::errors::InvalidArgument( - "neg_pos_ratio must be greater than zero in " - "max_negative mode. But received %f.", - neg_pos_ratio)); - PADDLE_ENFORCE_LT(neg_dist_threshold, - 1.0f, - platform::errors::InvalidArgument( - "neg_dist_threshold must be less than one in " - "max_negative mode. But received %f.", - neg_dist_threshold)); - PADDLE_ENFORCE_GT(neg_dist_threshold, - 0.0f, - platform::errors::InvalidArgument( - "neg_dist_threshold must be greater " - "than zero in max_negative mode.
But received %f.", - neg_dist_threshold)); - } else if (mining_type == MiningType::kHardExample) { - auto sample_size = ctx->Attrs().Get("sample_size"); - PADDLE_ENFORCE_GT(sample_size, - 0, - platform::errors::InvalidArgument( - "sample_size must greater than zero in " - "hard_example mode. But received %d.", - sample_size)); - } - - ctx->SetOutputDim("UpdatedMatchIndices", idx_dims); - // The first dimension of NegIndices will be set correcttly in Compute. - ctx->SetOutputDim("NegIndices", {-1, 1}); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey( - OperatorWithKernel::IndicateVarDataType(ctx, "ClsLoss"), - platform::CPUPlace()); - } -}; - -class MineHardExamplesOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput( - "ClsLoss", - "(Tensor, default Tensor), The classification loss with shape " - "[N, Np], N is the batch size and Np is the number of prior box."); - AddInput("LocLoss", - "(Tensor, optional, default Tensor), The localization loss " - "with shape [N, Np], N is the batch size and Np is the number of " - "prior box.") - .AsDispensable(); - AddInput("MatchIndices", - "(Tensor, Tensor), Matched indices with shape [N, Np], N is " - "the batch size and Np is the number of prior box. " - "MatchIndices[i][j] equal -1 means the j-th prior box in i-th " - "instance does not match any entity, otherwise means it is " - "matched to row."); - AddInput("MatchDist", - "(Tensor, default Tensor) Matched indices with shape [N, " - "Np], N is the batch size and Np is the number of prior box."); - AddAttr("neg_pos_ratio", - "(float) The ratio of the negative box to the positive " - "box. Use only when mining_type is max_negative.") - .SetDefault(1.0); - AddAttr("neg_dist_threshold", - "(float) The negative overlap upper bound for the unmatched " - "predictions. Use only when mining_type is max_negative.") - .SetDefault(0.5); - AddAttr("sample_size", - "(float) The max sample size of negative box. Use only when " - "mining_type is hard_example.") - .SetDefault(0); - AddAttr("mining_type", - "(float) The mining algorithm name, the value is " - "hard_example or max_negative.") - .SetDefault("max_negative") - .InEnum({"hard_example", "max_negative"}); - - AddOutput("NegIndices", - "(phi::DenseTensor) The output of negative example indices. " - "a phi::DenseTensor " - "with shape [Neg, 1]. The size of lod[0] minus 1 is batch size, " - "and each element is the prior box index. " - "For example, the batch size is 2, the lod is [[0, 1, 2]], " - "the sample 0's box 1(MatchIndices[0][1]) is selected, " - "and sample 1's box 0 is selected. The output NegIndices is " - "[[1], [0]]."); - - AddOutput("UpdatedMatchIndices", - "(Tensor) The output of updated MatchIndices, a tensor with " - "shape [N, Np]. Only update when mining_type is " - "hard_example. The input MatchIndices elements will be update to " - "-1 when it is not in the candidate high loss list of negative " - "examples."); - - AddComment(R"DOC( -Mine hard examples Operator. -This operator implements hard example mining to select a subset of negative box indices. -For each image, selects the box with highest losses. subject to the condition that the -box cannot have an Matcht > neg_dist_threshold when mining_type is max_negative. 
-The selected number is min(sample_size, max_negative_box_number) when mining_type is -hard_example, or min(neg_pos_ratio * positive_box_number, max_negative_box_number) -when mining_type is max_negative, where the max_negative_box_number is the count of -MatchIndices elements with value -1. -)DOC"); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - mine_hard_examples, - ops::MineHardExamplesOp, - ops::MineHardExamplesOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -PD_REGISTER_STRUCT_KERNEL(mine_hard_examples, - CPU, - ALL_LAYOUT, - ops::MineHardExamplesKernel, - float, - double) {} diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index 9cd9e76772424..73ec6caa61c27 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -47,22 +47,22 @@ class MultiClassNMSOp : public framework::OperatorWithKernel { if (ctx->IsRuntime()) { PADDLE_ENFORCE_EQ(score_size == 2 || score_size == 3, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rank of Input(Scores) must be 2 or 3" ". But received rank = %d", score_size)); - PADDLE_ENFORCE_EQ(box_dims.size(), - 3, - platform::errors::InvalidArgument( - "The rank of Input(BBoxes) must be 3" - ". But received rank = %d", - box_dims.size())); + PADDLE_ENFORCE_EQ( + box_dims.size(), + 3, + phi::errors::InvalidArgument("The rank of Input(BBoxes) must be 3" + ". But received rank = %d", + box_dims.size())); if (score_size == 3) { PADDLE_ENFORCE_EQ(box_dims[2] == 4 || box_dims[2] == 8 || box_dims[2] == 16 || box_dims[2] == 24 || box_dims[2] == 32, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The last dimension of Input" "(BBoxes) must be 4 or 8, " "represents the layout of coordinate " @@ -74,7 +74,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( box_dims[1], score_dims[2], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The 2nd dimension of Input(BBoxes) must be equal to " "last dimension of Input(Scores), which represents the " "predicted bboxes." @@ -84,14 +84,14 @@ class MultiClassNMSOp : public framework::OperatorWithKernel { } else { PADDLE_ENFORCE_EQ(box_dims[2], 4, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The last dimension of Input" "(BBoxes) must be 4. But received dimension = %d", box_dims[2])); PADDLE_ENFORCE_EQ( box_dims[1], score_dims[1], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The 2nd dimension of Input" "(BBoxes) must be equal to the 2nd dimension of Input(Scores). 
" "But received box dimension = %d, score dimension = %d", diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cc b/paddle/fluid/operators/detection/polygon_box_transform_op.cc index 0059aedcdc86c..35518b224e5ad 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cc +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cc @@ -21,10 +21,9 @@ template class PolygonBoxTransformCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), - true, - platform::errors::InvalidArgument("It must use CUDAPlace.")); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), + true, + phi::errors::InvalidArgument("It must use CUDAPlace.")); auto* in = ctx.Input("Input"); auto in_dims = common::vectorize(in->dims()); const T* in_data = in->data(); @@ -66,12 +65,12 @@ class PolygonBoxTransformOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( in_dim.size(), 4, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "input's rank must be 4. But received: Input rank is [%d]", in_dim.size())); PADDLE_ENFORCE_EQ(in_dim[1] % 2, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "input's second dimension must be even. But " "received: Input 2nd dimension is [%d]", in_dim[1])); diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cu b/paddle/fluid/operators/detection/polygon_box_transform_op.cu index 4f182464f77b5..b23a8d4e41bc5 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cu +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu @@ -45,7 +45,7 @@ class PolygonBoxTransformOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The polygon_box_transform operator needs to be executed on GPU.")); auto* in = ctx.Input("Input"); auto in_dims = in->dims(); diff --git a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc deleted file mode 100644 index b97cfe81a5a17..0000000000000 --- a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc +++ /dev/null @@ -1,676 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -limitations under the License. 
*/ - -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -class RetinanetDetectionOutputOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_GE( - ctx->Inputs("BBoxes").size(), - 1UL, - platform::errors::InvalidArgument("The length of Input(BBoxes) should " - "be greater than 0, but received " - "BBoxes length is:%d.", - ctx->Inputs("BBoxes").size())); - PADDLE_ENFORCE_GE( - ctx->Inputs("Scores").size(), - 1UL, - platform::errors::InvalidArgument("The length of Input(Scores) should " - "be greater than 0, but received " - "Scores length is:%d.", - ctx->Inputs("Scores").size())); - PADDLE_ENFORCE_GE( - ctx->Inputs("Anchors").size(), - 1UL, - platform::errors::InvalidArgument("The length of Input(Anchors) should " - "be greater than 0, but received " - "Anchors length is:%d.", - ctx->Inputs("Anchors").size())); - PADDLE_ENFORCE_EQ( - ctx->Inputs("BBoxes").size(), - ctx->Inputs("Scores").size(), - platform::errors::InvalidArgument( - "Input(BBoxes) and Input(Scores) should have the same length, but " - "received BBoxes length is:%d, Scores length is:%d.", - ctx->Inputs("BBoxes").size(), - ctx->Inputs("Scores").size())); - PADDLE_ENFORCE_EQ( - ctx->Inputs("BBoxes").size(), - ctx->Inputs("Anchors").size(), - platform::errors::InvalidArgument( - "Input(BBoxes) and Input(Anchors) should have the same length, but " - "received BBoxes length is:%d, Anchors length is:%d.", - ctx->Inputs("BBoxes").size(), - ctx->Inputs("Anchors").size())); - OP_INOUT_CHECK(ctx->HasInput("ImInfo"), - "Input", - "ImInfo", - "retinanet_detection_output"); - OP_INOUT_CHECK( - ctx->HasOutput("Out"), "Output", "Out", "retinanet_detection_output"); - - auto bboxes_dims = ctx->GetInputsDim("BBoxes"); - auto scores_dims = ctx->GetInputsDim("Scores"); - auto anchors_dims = ctx->GetInputsDim("Anchors"); - auto im_info_dims = ctx->GetInputDim("ImInfo"); - - const size_t b_n = bboxes_dims.size(); - PADDLE_ENFORCE_GT(b_n, - 0, - platform::errors::InvalidArgument( - "The number of Variables in Input(BBoxes) " - "should be greater than 0, " - "but received number is:%d.", - b_n)); - const size_t s_n = scores_dims.size(); - PADDLE_ENFORCE_GT(s_n, - 0, - platform::errors::InvalidArgument( - "The number of Variables in Input(Scores) " - "should be greater than 0, " - "but received number is:%d.", - s_n)); - const size_t a_n = anchors_dims.size(); - PADDLE_ENFORCE_GT(a_n, - 0, - platform::errors::InvalidArgument( - "The number of Variables in Input(Anchors) " - "should be greater than 0, " - "but received number is:%d.", - a_n)); - auto bbox_dims = bboxes_dims[0]; - auto score_dims = scores_dims[0]; - auto anchor_dims = anchors_dims[0]; - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - score_dims.size(), - 3, - platform::errors::InvalidArgument( - "The rank of each Variable in Input(Scores) must be 3, " - "but received rank is:%d.", - score_dims.size())); - PADDLE_ENFORCE_EQ( - bbox_dims.size(), - 3, - platform::errors::InvalidArgument( - "The rank of each Variable in Input(BBoxes) must be 3, " - "but received rank is:%d.", - bbox_dims.size())); - PADDLE_ENFORCE_EQ( - anchor_dims.size(), - 2, - platform::errors::InvalidArgument( - "The rank of each Variable in Input(Anchors) must be 2, " - "but received rank is:%d.", - anchor_dims.size())); - PADDLE_ENFORCE_EQ( - bbox_dims[2], - 4, - platform::errors::InvalidArgument( - "The last 
dimension of each Variable in Input(BBoxes) must be 4 " - "representing the layout of coordinate [xmin, ymin, xmax, ymax], " - "but received dimension is:%d.", - bbox_dims[2])); - PADDLE_ENFORCE_EQ(bbox_dims[1], - score_dims[1], - platform::errors::InvalidArgument( - "The 2nd dimension of Variables in Input(BBoxes) " - "and Input(Scores) " - "must be same, which represents the number of the " - "predicted boxes, " - "but received BBoxes 2nd dimension is:%d, Scores " - "2nd dimension is:%d.", - bbox_dims[1], - score_dims[1])); - PADDLE_ENFORCE_EQ( - anchor_dims[0], - bbox_dims[1], - platform::errors::InvalidArgument( - "The 1st dimension of each Variables in Input(Anchors) must be " - "equal " - "to the 2nd dimension of corresponding Variables in " - "Input(BBoxes), " - "which represents the number of the predicted boxes, but " - "received " - "Anchors 1st dimension is:%d, BBoxes 2nd dimension is:%d.", - anchor_dims[0], - bbox_dims[1])); - PADDLE_ENFORCE_EQ(im_info_dims.size(), - 2, - platform::errors::InvalidArgument( - "The rank of Input(ImInfo) must be 2, but " - "received ImInfo rank is:%d.", - im_info_dims.size())); - } - // Here the box_dims[0] is not the real dimension of output. - // It will be rewritten in the computing kernel. - ctx->SetOutputDim("Out", {bbox_dims[1], bbox_dims[2] + 2}); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto input_data_type = - OperatorWithKernel::IndicateVarDataType(ctx, "Scores"); - return phi::KernelKey(input_data_type, - platform::CPUPlace()); // ctx.GetPlace()); - } -}; - -template -bool SortScorePairDescend(const std::pair& pair1, - const std::pair& pair2) { - return pair1.first > pair2.first; -} - -template -bool SortScoreTwoPairDescend(const std::pair>& pair1, - const std::pair>& pair2) { - return pair1.first > pair2.first; -} - -template -static inline void GetMaxScoreIndex( - const std::vector& scores, - const T threshold, - int top_k, - std::vector>* sorted_indices) { - for (size_t i = 0; i < scores.size(); ++i) { - if (scores[i] > threshold) { - sorted_indices->push_back(std::make_pair(scores[i], i)); - } - } - // Sort the score pair according to the scores in descending order - std::stable_sort(sorted_indices->begin(), - sorted_indices->end(), - SortScorePairDescend); - // Keep top_k scores if needed. - if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { - sorted_indices->resize(top_k); - } -} - -template -static inline T BBoxArea(const std::vector& box, const bool normalized) { - if (box[2] < box[0] || box[3] < box[1]) { - // If coordinate values are is invalid - // (e.g. xmax < xmin or ymax < ymin), return 0. - return static_cast(0.); - } else { - const T w = box[2] - box[0]; - const T h = box[3] - box[1]; - if (normalized) { - return w * h; - } else { - // If coordinate values are not within range [0, 1]. - return (w + 1) * (h + 1); - } - } -} - -template -static inline T JaccardOverlap(const std::vector& box1, - const std::vector& box2, - const bool normalized) { - if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || - box2[3] < box1[1]) { - return static_cast(0.); - } else { - const T inter_xmin = std::max(box1[0], box2[0]); - const T inter_ymin = std::max(box1[1], box2[1]); - const T inter_xmax = std::min(box1[2], box2[2]); - const T inter_ymax = std::min(box1[3], box2[3]); - T norm = normalized ? static_cast(0.) 
: static_cast(1.); - T inter_w = inter_xmax - inter_xmin + norm; - T inter_h = inter_ymax - inter_ymin + norm; - const T inter_area = inter_w * inter_h; - const T bbox1_area = BBoxArea(box1, normalized); - const T bbox2_area = BBoxArea(box2, normalized); - return inter_area / (bbox1_area + bbox2_area - inter_area); - } -} - -template -class RetinanetDetectionOutputKernel : public framework::OpKernel { - public: - void NMSFast(const std::vector>& cls_dets, - const T nms_threshold, - const T eta, - std::vector* selected_indices) const { - int64_t num_boxes = cls_dets.size(); - std::vector> sorted_indices; - for (int64_t i = 0; i < num_boxes; ++i) { - sorted_indices.push_back(std::make_pair(cls_dets[i][4], i)); - } - // Sort the score pair according to the scores in descending order - std::stable_sort(sorted_indices.begin(), - sorted_indices.end(), - SortScorePairDescend); - selected_indices->clear(); - T adaptive_threshold = nms_threshold; - - while (!sorted_indices.empty()) { - const int idx = sorted_indices.front().second; - bool keep = true; - for (const auto kept_idx : *selected_indices) { - if (keep) { - T overlap = T(0.); - overlap = JaccardOverlap(cls_dets[idx], cls_dets[kept_idx], false); - keep = overlap <= adaptive_threshold; - } else { - break; - } - } - if (keep) { - selected_indices->push_back(idx); - } - sorted_indices.erase(sorted_indices.begin()); - if (keep && eta < 1 && adaptive_threshold > 0.5) { - adaptive_threshold *= eta; - } - } - } - - void DeltaScoreToPrediction( - const std::vector& bboxes_data, - const std::vector& anchors_data, - T im_height, - T im_width, - T im_scale, - int class_num, - const std::vector>& sorted_indices, - std::map>>* preds) const { - im_height = static_cast(round(im_height / im_scale)); - im_width = static_cast(round(im_width / im_scale)); - T zero(0); - int i = 0; - for (const auto& it : sorted_indices) { - T score = it.first; - int idx = it.second; - int a = idx / class_num; - int c = idx % class_num; - - int box_offset = a * 4; - T anchor_box_width = - anchors_data[box_offset + 2] - anchors_data[box_offset] + 1; - T anchor_box_height = - anchors_data[box_offset + 3] - anchors_data[box_offset + 1] + 1; - T anchor_box_center_x = anchors_data[box_offset] + anchor_box_width / 2; - T anchor_box_center_y = - anchors_data[box_offset + 1] + anchor_box_height / 2; - T target_box_center_x = 0, target_box_center_y = 0; - T target_box_width = 0, target_box_height = 0; - target_box_center_x = - bboxes_data[box_offset] * anchor_box_width + anchor_box_center_x; - target_box_center_y = - bboxes_data[box_offset + 1] * anchor_box_height + anchor_box_center_y; - target_box_width = - std::exp(bboxes_data[box_offset + 2]) * anchor_box_width; - target_box_height = - std::exp(bboxes_data[box_offset + 3]) * anchor_box_height; - T pred_box_xmin = target_box_center_x - target_box_width / 2; - T pred_box_ymin = target_box_center_y - target_box_height / 2; - T pred_box_xmax = target_box_center_x + target_box_width / 2 - 1; - T pred_box_ymax = target_box_center_y + target_box_height / 2 - 1; - pred_box_xmin = pred_box_xmin / im_scale; - pred_box_ymin = pred_box_ymin / im_scale; - pred_box_xmax = pred_box_xmax / im_scale; - pred_box_ymax = pred_box_ymax / im_scale; - - pred_box_xmin = std::max(std::min(pred_box_xmin, im_width - 1), zero); - pred_box_ymin = std::max(std::min(pred_box_ymin, im_height - 1), zero); - pred_box_xmax = std::max(std::min(pred_box_xmax, im_width - 1), zero); - pred_box_ymax = std::max(std::min(pred_box_ymax, im_height - 1), zero); - - 
std::vector one_pred; - one_pred.push_back(pred_box_xmin); - one_pred.push_back(pred_box_ymin); - one_pred.push_back(pred_box_xmax); - one_pred.push_back(pred_box_ymax); - one_pred.push_back(score); - (*preds)[c].push_back(one_pred); - i++; - } - } - - void MultiClassNMS(const std::map>>& preds, - int class_num, - const int keep_top_k, - const T nms_threshold, - const T nms_eta, - std::vector>* nmsed_out, - int* num_nmsed_out) const { - std::map> indices; - int num_det = 0; - for (int c = 0; c < class_num; ++c) { - if (static_cast(preds.count(c))) { - const std::vector> cls_dets = preds.at(c); - NMSFast(cls_dets, nms_threshold, nms_eta, &(indices[c])); - num_det += static_cast(indices[c].size()); - } - } - - std::vector>> score_index_pairs; - for (const auto& it : indices) { - int label = it.first; - const std::vector& label_indices = it.second; - for (auto idx : label_indices) { - score_index_pairs.push_back(std::make_pair(preds.at(label)[idx][4], - std::make_pair(label, idx))); - } - } - // Keep top k results per image. - std::stable_sort(score_index_pairs.begin(), - score_index_pairs.end(), - SortScoreTwoPairDescend); - if (num_det > keep_top_k) { - score_index_pairs.resize(keep_top_k); - } - - // Store the new indices. - std::map> new_indices; - for (const auto& it : score_index_pairs) { - int label = it.second.first; - int idx = it.second.second; - std::vector one_pred; - one_pred.push_back(label); - one_pred.push_back(preds.at(label)[idx][4]); - one_pred.push_back(preds.at(label)[idx][0]); - one_pred.push_back(preds.at(label)[idx][1]); - one_pred.push_back(preds.at(label)[idx][2]); - one_pred.push_back(preds.at(label)[idx][3]); - nmsed_out->push_back(one_pred); - } - - *num_nmsed_out = (num_det > keep_top_k ? keep_top_k : num_det); - } - - void RetinanetDetectionOutput(const framework::ExecutionContext& ctx, - const std::vector& scores, - const std::vector& bboxes, - const std::vector& anchors, - const phi::DenseTensor& im_info, - std::vector>* nmsed_out, - int* num_nmsed_out) const { - int64_t nms_top_k = ctx.Attr("nms_top_k"); - int64_t keep_top_k = ctx.Attr("keep_top_k"); - T nms_threshold = static_cast(ctx.Attr("nms_threshold")); - T nms_eta = static_cast(ctx.Attr("nms_eta")); - T score_threshold = static_cast(ctx.Attr("score_threshold")); - - int64_t class_num = scores[0].dims()[1]; - std::map>> preds; - for (size_t l = 0; l < scores.size(); ++l) { - // Fetch per level score - phi::DenseTensor scores_per_level = scores[l]; - // Fetch per level bbox - phi::DenseTensor bboxes_per_level = bboxes[l]; - // Fetch per level anchor - phi::DenseTensor anchors_per_level = anchors[l]; - - int64_t scores_num = scores_per_level.numel(); - int64_t bboxes_num = bboxes_per_level.numel(); - std::vector scores_data(scores_num); - std::vector bboxes_data(bboxes_num); - std::vector anchors_data(bboxes_num); - std::copy_n(scores_per_level.data(), scores_num, scores_data.begin()); - std::copy_n(bboxes_per_level.data(), bboxes_num, bboxes_data.begin()); - std::copy_n( - anchors_per_level.data(), bboxes_num, anchors_data.begin()); - std::vector> sorted_indices; - - // For the highest level, we take the threshold 0.0 - T threshold = (l < (scores.size() - 1) ? 
score_threshold : 0.0); - GetMaxScoreIndex(scores_data, threshold, nms_top_k, &sorted_indices); - auto* im_info_data = im_info.data(); - auto im_height = im_info_data[0]; - auto im_width = im_info_data[1]; - auto im_scale = im_info_data[2]; - DeltaScoreToPrediction(bboxes_data, - anchors_data, - im_height, - im_width, - im_scale, - class_num, - sorted_indices, - &preds); - } - - MultiClassNMS(preds, - class_num, - keep_top_k, - nms_threshold, - nms_eta, - nmsed_out, - num_nmsed_out); - } - - void MultiClassOutput(const platform::DeviceContext& ctx, - const std::vector>& nmsed_out, - phi::DenseTensor* outs) const { - auto* odata = outs->data(); - int count = 0; - int64_t out_dim = 6; - for (size_t i = 0; i < nmsed_out.size(); ++i) { - odata[count * out_dim] = nmsed_out[i][0] + 1; // label - odata[count * out_dim + 1] = nmsed_out[i][1]; // score - odata[count * out_dim + 2] = nmsed_out[i][2]; // xmin - odata[count * out_dim + 3] = nmsed_out[i][3]; // ymin - odata[count * out_dim + 4] = nmsed_out[i][4]; // xmax - odata[count * out_dim + 5] = nmsed_out[i][5]; // ymax - count++; - } - } - - void Compute(const framework::ExecutionContext& ctx) const override { - auto boxes = ctx.MultiInput("BBoxes"); - auto scores = ctx.MultiInput("Scores"); - auto anchors = ctx.MultiInput("Anchors"); - auto* im_info = ctx.Input("ImInfo"); - auto* outs = ctx.Output("Out"); - - std::vector boxes_list(boxes.size()); - std::vector scores_list(scores.size()); - std::vector anchors_list(anchors.size()); - for (size_t j = 0; j < boxes_list.size(); ++j) { - boxes_list[j] = *boxes[j]; - scores_list[j] = *scores[j]; - anchors_list[j] = *anchors[j]; - } - auto score_dims = scores_list[0].dims(); - int64_t batch_size = score_dims[0]; - auto box_dims = boxes_list[0].dims(); - int64_t box_dim = box_dims[2]; - int64_t out_dim = box_dim + 2; - - auto& dev_ctx = ctx.template device_context(); - - std::vector>> all_nmsed_out; - std::vector batch_starts = {0}; - for (int i = 0; i < batch_size; ++i) { - int num_nmsed_out = 0; - std::vector box_per_batch_list(boxes_list.size()); - std::vector score_per_batch_list(scores_list.size()); - for (size_t j = 0; j < boxes_list.size(); ++j) { - const auto& score_dims = scores_list[j].dims(); - score_per_batch_list[j] = scores_list[j].Slice(i, i + 1); - score_per_batch_list[j].Resize({score_dims[1], score_dims[2]}); - box_per_batch_list[j] = boxes_list[j].Slice(i, i + 1); - box_per_batch_list[j].Resize({score_dims[1], box_dim}); - } - phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1); - - std::vector> nmsed_out; - RetinanetDetectionOutput(ctx, - score_per_batch_list, - box_per_batch_list, - anchors_list, - im_info_slice, - &nmsed_out, - &num_nmsed_out); - all_nmsed_out.push_back(nmsed_out); - batch_starts.push_back(batch_starts.back() + num_nmsed_out); - } - - int num_kept = static_cast(batch_starts.back()); - if (num_kept == 0) { - outs->Resize({0, out_dim}); - } else { - outs->mutable_data({num_kept, out_dim}, ctx.GetPlace()); - for (int i = 0; i < batch_size; ++i) { - int64_t s = static_cast(batch_starts[i]); - int64_t e = static_cast(batch_starts[i + 1]); - if (e > s) { - phi::DenseTensor out = outs->Slice(s, e); - MultiClassOutput(dev_ctx, all_nmsed_out[i], &out); - } - } - } - - framework::LoD lod; - lod.emplace_back(batch_starts); - - outs->set_lod(lod); - } -}; - -class RetinanetDetectionOutputOpMaker - : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("BBoxes", - "(List) A list of tensors from multiple FPN levels.
Each " - "element is a 3-D phi::DenseTensor with shape [N, Mi, 4] " - "represents the " - "predicted locations of Mi bounding boxes, N is the batch size. " - "Mi is the number of bounding boxes from i-th FPN level. Each " - "bounding box has four coordinate values and the layout is " - "[xmin, ymin, xmax, ymax].") - .AsDuplicable(); - AddInput("Scores", - "(List) A list of tensors from multiple FPN levels. Each " - "element is a 3-D phi::DenseTensor with shape [N, Mi, C] " - "represents the " - "predicted confidence from its FPN level. N is the batch size, " - "C is the class number (excluding background), Mi is the number " - "of bounding boxes from i-th FPN level. For each bounding box, " - "there are total C scores.") - .AsDuplicable(); - AddInput( - "Anchors", - "(List) A list of tensors from multiple FPN levels. Each" - "element is a 2-D phi::DenseTensor with shape [Mi, 4] represents the " - "locations of Mi anchor boxes from i-th FPN level. Each " - "bounding box has four coordinate values and the layout is " - "[xmin, ymin, xmax, ymax].") - .AsDuplicable(); - AddInput("ImInfo", - "(phi::DenseTensor) A 2-D phi::DenseTensor with shape [N, 3] " - "represents the " - "image information. N is the batch size, each image information " - "includes height, width and scale."); - AddAttr("score_threshold", - "(float) " - "Threshold to filter out bounding boxes with a confidence " - "score."); - AddAttr("nms_top_k", - "(int64_t) " - "Maximum number of detections per FPN layer to be kept " - "according to the confidence before NMS."); - AddAttr("nms_threshold", - "(float) " - "The threshold to be used in NMS."); - AddAttr("nms_eta", - "(float) " - "The parameter for adaptive NMS."); - AddAttr( - "keep_top_k", - "(int64_t) " - "Number of total bounding boxes to be kept per image after NMS " - "step."); - AddOutput("Out", - "(phi::DenseTensor) A 2-D phi::DenseTensor with shape [No, 6] " - "represents the " - "detections. Each row has 6 values: " - "[label, confidence, xmin, ymin, xmax, ymax]" - "No is the total number of detections in this mini-batch." - "For each instance, " - "the offsets in first dimension are called LoD, the number of " - "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is " - "no detected bbox."); - AddComment(R"DOC( -This operator is to decode boxes and scores from each FPN layer and do -multi-class non maximum suppression (NMS) on merged predictions. - -Top-scoring predictions per FPN layer are decoded with the anchor -information. This operator greedily selects a subset of detection bounding -boxes from each FPN layer that have high scores larger than score_threshold, -if providing this threshold, then selects the largest nms_top_k confidences -scores per FPN layer, if nms_top_k is larger than -1. -The decoding schema is described below: - -ox = (pw * pxv * tx * + px) - tw / 2 - -oy = (ph * pyv * ty * + py) - th / 2 - -ow = exp(pwv * tw) * pw + tw / 2 - -oh = exp(phv * th) * ph + th / 2 - -where `tx`, `ty`, `tw`, `th` denote the predicted box's center coordinates, width -and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the -anchor's center coordinates, width and height. `pxv`, `pyv`, `pwv`, -`phv` denote the variance of the anchor box and `ox`, `oy`, `ow`, `oh` denote the -decoded coordinates, width and height. - -Then the top decoded prediction from all levels are merged followed by NMS. 
-In the NMS step, this operator prunes away boxes that have a high IOU -(intersection over union) overlap with already selected boxes by adaptive -threshold NMS based on the parameters nms_threshold and nms_eta. -After the NMS step, at most keep_top_k bounding boxes in total are kept -per image if keep_top_k is larger than -1. -This operator supports multi-class and batched inputs. It applies NMS -independently for each class. The output is a 2-D LoDTensor; for each -image, the offsets in the first dimension of the phi::DenseTensor are called LoD, and the number -of offsets is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0, -there is no detected bounding box for this image. If there are no detected boxes -for any image, all the elements in the LoD are set to 0, and the output tensor is -empty (None). -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - retinanet_detection_output, - ops::RetinanetDetectionOutputOp, - ops::RetinanetDetectionOutputOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -PD_REGISTER_STRUCT_KERNEL(retinanet_detection_output, - CPU, - ALL_LAYOUT, - ops::RetinanetDetectionOutputKernel, - float, - double) {} diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc deleted file mode 100644 index 81e8d0d3edf7e..0000000000000 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ /dev/null @@ -1,1262 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detection/bbox_util.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -using EigenMatrix = framework::EigenMatrix; - -class RpnTargetAssignOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("Anchor"), "Input", "Anchor", "rpn_target_assign"); - OP_INOUT_CHECK( - ctx->HasInput("GtBoxes"), "Input", "GtBoxes", "rpn_target_assign"); - OP_INOUT_CHECK( - ctx->HasInput("IsCrowd"), "Input", "IsCrowd", "rpn_target_assign"); - OP_INOUT_CHECK( - ctx->HasInput("ImInfo"), "Input", "ImInfo", "rpn_target_assign"); - - OP_INOUT_CHECK(ctx->HasOutput("LocationIndex"), - "Output", - "LocationIndex", - "rpn_target_assign"); - OP_INOUT_CHECK(ctx->HasOutput("ScoreIndex"), - "Output", - "ScoreIndex", - "rpn_target_assign"); - OP_INOUT_CHECK(ctx->HasOutput("TargetLabel"), - "Output", - "TargetLabel", - "rpn_target_assign"); - OP_INOUT_CHECK(ctx->HasOutput("TargetBBox"), - "Output", - "TargetBBox", - "rpn_target_assign"); - OP_INOUT_CHECK(ctx->HasOutput("BBoxInsideWeight"), - "Output", - "BBoxInsideWeight", - "rpn_target_assign"); - - auto anchor_dims = ctx->GetInputDim("Anchor"); - auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); - auto im_info_dims = ctx->GetInputDim("ImInfo"); - PADDLE_ENFORCE_EQ(anchor_dims.size(), - 2, - platform::errors::InvalidArgument( - "The dimensions size of Input(Anchor) must be 2. But " - "received dimensions size=[%d], dimensions=[%s].", - anchor_dims.size(), - anchor_dims)); - PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), - 2, - platform::errors::InvalidArgument( - "The dimensions size of Input(GtBoxes) must be 2. " - "But received dimensions size=[%d], dimensions=[%s].", - gt_boxes_dims.size(), - gt_boxes_dims)); - PADDLE_ENFORCE_EQ(im_info_dims.size(), - 2, - platform::errors::InvalidArgument( - "The dimensions size of Input(ImInfo) must be 2. 
But " - "received dimensions size=[%d], dimensions=[%s].", - im_info_dims.size(), - im_info_dims)); - - ctx->SetOutputDim("LocationIndex", {-1}); - ctx->SetOutputDim("ScoreIndex", {-1}); - ctx->SetOutputDim("TargetLabel", {-1, 1}); - ctx->SetOutputDim("TargetBBox", {-1, 4}); - ctx->SetOutputDim("BBoxInsideWeight", {-1, 4}); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey( - OperatorWithKernel::IndicateVarDataType(ctx, "Anchor"), - platform::CPUPlace()); - } -}; - -template -void AppendRpns(phi::DenseTensor* out, - int64_t offset, - phi::DenseTensor* to_add) { - auto* out_data = out->data(); - auto* to_add_data = to_add->data(); - memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T)); -} - -template -std::vector FilterStraddleAnchor( - const phi::CPUContext& context, - const phi::DenseTensor* anchor, - const float rpn_straddle_thresh, - T im_height, - T im_width) { - std::vector inds_inside; - int anchor_num = static_cast(anchor->dims()[0]); - auto* anchor_data = anchor->data(); - if (rpn_straddle_thresh >= 0) { - int index = 0; - for (int i = 0; i < anchor_num; ++i) { - index = i * 4; - if ((anchor_data[index + 0] >= -rpn_straddle_thresh) && - (anchor_data[index + 1] >= -rpn_straddle_thresh) && - (anchor_data[index + 2] < im_width + rpn_straddle_thresh) && - (anchor_data[index + 3] < im_height + rpn_straddle_thresh)) { - inds_inside.emplace_back(i); - } - } - } else { - for (int i = 0; i < anchor_num; ++i) { - inds_inside.emplace_back(i); - } - } - int inside_num = static_cast(inds_inside.size()); - phi::DenseTensor inds_inside_t; - int* inds_inside_data = - inds_inside_t.mutable_data({inside_num}, context.GetPlace()); - std::copy(inds_inside.begin(), inds_inside.end(), inds_inside_data); - phi::DenseTensor inside_anchor_t; - T* inside_anchor_data = - inside_anchor_t.mutable_data({inside_num, 4}, context.GetPlace()); - Gather( - anchor->data(), 4, inds_inside_data, inside_num, inside_anchor_data); - std::vector res; - res.emplace_back(inds_inside_t); - res.emplace_back(inside_anchor_t); - return res; -} - -template -phi::DenseTensor FilterCrowdGt(const phi::CPUContext& context, - phi::DenseTensor* gt_boxes, - phi::DenseTensor* is_crowd) { - int gt_num = static_cast(gt_boxes->dims()[0]); - std::vector not_crowd_inds; - auto* is_crowd_data = is_crowd->data(); - for (int i = 0; i < gt_num; ++i) { - if (is_crowd_data[i] == 0) { - not_crowd_inds.emplace_back(i); - } - } - int ncrowd_num = static_cast(not_crowd_inds.size()); - phi::DenseTensor ncrowd_gt_boxes; - T* ncrowd_gt_boxes_data = - ncrowd_gt_boxes.mutable_data({ncrowd_num, 4}, context.GetPlace()); - Gather(gt_boxes->data(), - 4, - not_crowd_inds.data(), - ncrowd_num, - ncrowd_gt_boxes_data); - return ncrowd_gt_boxes; -} - -void ReservoirSampling(const int num, - std::vector* inds, - std::minstd_rand engine, - bool use_random) { - std::uniform_real_distribution uniform(0, 1); - int len = static_cast(inds->size()); - if (len > num) { - if (use_random) { - for (int i = num; i < len; ++i) { - int rng_ind = std::floor(uniform(engine) * i); // NOLINT - if (rng_ind < num) - std::iter_swap(inds->begin() + rng_ind, inds->begin() + i); - } - } - inds->resize(num); - } -} - -template -void ScoreAssign(const T* anchor_by_gt_overlap_data, - const phi::DenseTensor& anchor_to_gt_max, - const phi::DenseTensor& gt_to_anchor_max, - const int rpn_batch_size_per_im, - const float rpn_fg_fraction, - const float rpn_positive_overlap, - const float 
rpn_negative_overlap, - std::vector* fg_inds, - std::vector* bg_inds, - std::vector* tgt_lbl, - std::vector* fg_fake, - std::vector* bbox_inside_weight, - std::minstd_rand engine, - bool use_random) { - float epsilon = 0.00001; - int anchor_num = static_cast(anchor_to_gt_max.dims()[0]); - int gt_num = static_cast(gt_to_anchor_max.dims()[0]); - std::vector target_label(anchor_num, -1); - std::vector fg_inds_fake; - std::vector bg_inds_fake; - const T* anchor_to_gt_max_data = anchor_to_gt_max.data(); - const T* gt_to_anchor_max_data = gt_to_anchor_max.data(); - // TODO(buxingyuan): Match with Detectron now - // but it seems here is a bug in two directions assignment - // in which the later one may overwrites the former one. - for (int64_t i = 0; i < anchor_num; ++i) { - bool is_anchors_with_max_overlap = false; - for (int64_t j = 0; j < gt_num; ++j) { - T value = anchor_by_gt_overlap_data[i * gt_num + j]; - T diff = std::abs(value - gt_to_anchor_max_data[j]); - if (diff < epsilon) { - is_anchors_with_max_overlap = true; - break; - } - } - bool is_anchor_great_than_thresh = - (anchor_to_gt_max_data[i] >= rpn_positive_overlap); - if (is_anchors_with_max_overlap || is_anchor_great_than_thresh) { - fg_inds_fake.push_back(i); // NOLINT - } - } - - // Reservoir Sampling - int fg_num = 0; - if (rpn_fg_fraction > 0 && rpn_batch_size_per_im > 0) { - fg_num = - static_cast(rpn_fg_fraction * rpn_batch_size_per_im); // NOLINT - ReservoirSampling(fg_num, &fg_inds_fake, engine, use_random); - } else { - fg_num = static_cast(fg_inds_fake.size()); - } - int fg_fake_num = static_cast(fg_inds_fake.size()); - for (int64_t i = 0; i < fg_fake_num; ++i) { - target_label[fg_inds_fake[i]] = 1; - } - - for (int64_t i = 0; i < anchor_num; ++i) { - if (anchor_to_gt_max_data[i] < rpn_negative_overlap) { - bg_inds_fake.push_back(i); // NOLINT - } - } - int bg_num = 0; - if (rpn_fg_fraction > 0 && rpn_batch_size_per_im > 0) { - bg_num = rpn_batch_size_per_im - fg_fake_num; - ReservoirSampling(bg_num, &bg_inds_fake, engine, use_random); - bg_num = static_cast(bg_inds_fake.size()); - } else { - bg_num = static_cast(bg_inds_fake.size()); - } - - int fake_num = 0; - for (int64_t i = 0; i < bg_num; ++i) { - // fg fake found - if (target_label[bg_inds_fake[i]] == 1) { - fake_num++; - fg_fake->emplace_back(fg_inds_fake[0]); - for (int j = 0; j < 4; ++j) { - bbox_inside_weight->emplace_back(T(0.)); - } - } - target_label[bg_inds_fake[i]] = 0; - } - - for (int64_t i = 0; i < (fg_fake_num - fake_num) * 4; ++i) { - bbox_inside_weight->emplace_back(T(1.)); - } - - for (int64_t i = 0; i < anchor_num; ++i) { - if (target_label[i] == 1) { - fg_inds->emplace_back(i); - fg_fake->emplace_back(i); - } - if (target_label[i] == 0) bg_inds->emplace_back(i); - } - fg_num = static_cast(fg_inds->size()); - bg_num = static_cast(bg_inds->size()); - - tgt_lbl->resize(fg_num + bg_num, 0); - std::vector fg_lbl(fg_num, 1); - std::vector bg_lbl(bg_num, 0); - std::copy(fg_lbl.begin(), fg_lbl.end(), tgt_lbl->data()); - std::copy(bg_lbl.begin(), bg_lbl.end(), tgt_lbl->data() + fg_num); -} - -template -std::vector SampleRpnFgBgGt( - const phi::CPUContext& ctx, - const phi::DenseTensor& anchor_by_gt_overlap, - const int rpn_batch_size_per_im, - const float rpn_positive_overlap, - const float rpn_negative_overlap, - const float rpn_fg_fraction, - std::minstd_rand engine, - bool use_random) { - auto* anchor_by_gt_overlap_data = anchor_by_gt_overlap.data(); - int anchor_num = static_cast(anchor_by_gt_overlap.dims()[0]); - int gt_num = 
static_cast(anchor_by_gt_overlap.dims()[1]); - - std::vector fg_inds; - std::vector bg_inds; - std::vector gt_inds; - std::vector tgt_lbl; - std::vector fg_fake; - std::vector bbox_inside_weight; - // Calculate the max IoU between anchors and gt boxes - // Map from anchor to gt box that has highest overlap - auto place = ctx.GetPlace(); - phi::DenseTensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max; - anchor_to_gt_max.mutable_data({anchor_num}, place); - int* argmax = anchor_to_gt_argmax.mutable_data({anchor_num}, place); - gt_to_anchor_max.mutable_data({gt_num}, place); - - auto anchor_by_gt_overlap_et = - framework::EigenMatrix::From(anchor_by_gt_overlap); - auto anchor_to_gt_max_et = - framework::EigenVector::Flatten(anchor_to_gt_max); - auto gt_to_anchor_max_et = - framework::EigenVector::Flatten(gt_to_anchor_max); - auto anchor_to_gt_argmax_et = - framework::EigenVector::Flatten(anchor_to_gt_argmax); - anchor_to_gt_max_et = - anchor_by_gt_overlap_et.maximum(Eigen::DSizes(1)); - anchor_to_gt_argmax_et = - anchor_by_gt_overlap_et.argmax(1).template cast(); - gt_to_anchor_max_et = - anchor_by_gt_overlap_et.maximum(Eigen::DSizes(0)); - - // Follow the Faster RCNN's implementation - ScoreAssign(anchor_by_gt_overlap_data, - anchor_to_gt_max, - gt_to_anchor_max, - rpn_batch_size_per_im, - rpn_fg_fraction, - rpn_positive_overlap, - rpn_negative_overlap, - &fg_inds, - &bg_inds, - &tgt_lbl, - &fg_fake, - &bbox_inside_weight, - engine, - use_random); - - int fg_num = static_cast(fg_inds.size()); - int bg_num = static_cast(bg_inds.size()); - int fg_fake_num = static_cast(fg_fake.size()); - gt_inds.reserve(fg_fake_num); - for (int i = 0; i < fg_fake_num; ++i) { - gt_inds.emplace_back(argmax[fg_fake[i]]); - } - phi::DenseTensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t, - bbox_inside_weight_t; - int* loc_index_data = loc_index_t.mutable_data({fg_fake_num}, place); - int* score_index_data = - score_index_t.mutable_data({fg_num + bg_num}, place); - int* tgt_lbl_data = tgt_lbl_t.mutable_data({fg_num + bg_num}, place); - int* gt_inds_data = gt_inds_t.mutable_data({fg_fake_num}, place); - T* bbox_inside_weight_data = - bbox_inside_weight_t.mutable_data({fg_fake_num, 4}, place); - std::copy(fg_fake.begin(), fg_fake.end(), loc_index_data); - std::copy(fg_inds.begin(), fg_inds.end(), score_index_data); - std::copy(bg_inds.begin(), bg_inds.end(), score_index_data + fg_num); - std::copy(tgt_lbl.begin(), tgt_lbl.end(), tgt_lbl_data); - std::copy(gt_inds.begin(), gt_inds.end(), gt_inds_data); - std::copy(bbox_inside_weight.begin(), - bbox_inside_weight.end(), - bbox_inside_weight_data); - std::vector loc_score_tgtlbl_gt; - loc_score_tgtlbl_gt.emplace_back(loc_index_t); - loc_score_tgtlbl_gt.emplace_back(score_index_t); - loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t); - loc_score_tgtlbl_gt.emplace_back(gt_inds_t); - loc_score_tgtlbl_gt.emplace_back(bbox_inside_weight_t); - - return loc_score_tgtlbl_gt; -} - -template -class RpnTargetAssignKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* anchor = context.Input("Anchor"); // (H*W*A) * 4 - auto* gt_boxes = context.Input("GtBoxes"); - auto* is_crowd = context.Input("IsCrowd"); - auto* im_info = context.Input("ImInfo"); - - auto* loc_index = context.Output("LocationIndex"); - auto* score_index = context.Output("ScoreIndex"); - auto* tgt_bbox = context.Output("TargetBBox"); - auto* tgt_lbl = context.Output("TargetLabel"); - auto* bbox_inside_weight = - 
context.Output("BBoxInsideWeight"); - - PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), - 1UL, - platform::errors::InvalidArgument( - "RpnTargetAssignOp gt_boxes needs 1 level of LoD. " - "But received level of LoD is [%d], LoD is [%s].", - gt_boxes->lod().size(), - gt_boxes->lod())); - PADDLE_ENFORCE_EQ(is_crowd->lod().size(), - 1UL, - platform::errors::InvalidArgument( - "RpnTargetAssignOp is_crowd needs 1 level of LoD. " - "But received level of LoD is [%d], LoD is [%s].", - is_crowd->lod().size(), - is_crowd->lod())); - int64_t anchor_num = static_cast(anchor->dims()[0]); - int64_t batch_num = static_cast(gt_boxes->lod().back().size() - 1); - - int rpn_batch_size_per_im = context.Attr("rpn_batch_size_per_im"); - float rpn_straddle_thresh = context.Attr("rpn_straddle_thresh"); - float rpn_positive_overlap = context.Attr("rpn_positive_overlap"); - float rpn_negative_overlap = context.Attr("rpn_negative_overlap"); - float rpn_fg_fraction = context.Attr("rpn_fg_fraction"); - bool use_random = context.Attr("use_random"); - - int64_t max_num = batch_num * rpn_batch_size_per_im; - auto place = context.GetPlace(); - - loc_index->mutable_data({max_num}, place); - score_index->mutable_data({max_num}, place); - tgt_bbox->mutable_data({max_num, 4}, place); - tgt_lbl->mutable_data({max_num, 1}, place); - bbox_inside_weight->mutable_data({max_num, 4}, place); - auto& dev_ctx = context.device_context(); - - std::random_device rnd; - std::minstd_rand engine; - int seed = static_cast(rnd()); - engine.seed(seed); - - framework::LoD lod_loc, loc_score; - std::vector lod0_loc(1, 0); - std::vector lod0_score(1, 0); - - int total_loc_num = 0; - int total_score_num = 0; - auto gt_boxes_lod = gt_boxes->lod().back(); - auto is_crowd_lod = is_crowd->lod().back(); - for (int i = 0; i < batch_num; ++i) { - phi::DenseTensor gt_boxes_slice = - gt_boxes->Slice(static_cast(gt_boxes_lod[i]), - static_cast(gt_boxes_lod[i + 1])); - phi::DenseTensor is_crowd_slice = - is_crowd->Slice(static_cast(is_crowd_lod[i]), - static_cast(is_crowd_lod[i + 1])); - phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1); - auto* im_info_data = im_info_slice.data(); - auto im_height = im_info_data[0]; - auto im_width = im_info_data[1]; - auto im_scale = im_info_data[2]; - - // Filter straddle anchor - std::vector filter_output = FilterStraddleAnchor( - dev_ctx, anchor, rpn_straddle_thresh, im_height, im_width); - phi::DenseTensor inds_inside = filter_output[0]; - phi::DenseTensor inside_anchor = filter_output[1]; - - // Filter crowd gt - phi::DenseTensor ncrowd_gt_boxes = - FilterCrowdGt(dev_ctx, >_boxes_slice, &is_crowd_slice); - auto ncrowd_gt_boxes_et = - framework::EigenTensor::From(ncrowd_gt_boxes); - ncrowd_gt_boxes_et = ncrowd_gt_boxes_et * im_scale; - - phi::DenseTensor anchor_by_gt_overlap; - anchor_by_gt_overlap.mutable_data( - {inside_anchor.dims()[0], ncrowd_gt_boxes.dims()[0]}, place); - BboxOverlaps(inside_anchor, ncrowd_gt_boxes, &anchor_by_gt_overlap); - - auto loc_score_tgtlbl_gt = SampleRpnFgBgGt(dev_ctx, - anchor_by_gt_overlap, - rpn_batch_size_per_im, - rpn_positive_overlap, - rpn_negative_overlap, - rpn_fg_fraction, - engine, - use_random); - - phi::DenseTensor sampled_loc_index = loc_score_tgtlbl_gt[0]; - phi::DenseTensor sampled_score_index = loc_score_tgtlbl_gt[1]; - phi::DenseTensor sampled_tgtlbl = loc_score_tgtlbl_gt[2]; - phi::DenseTensor sampled_gt_index = loc_score_tgtlbl_gt[3]; - phi::DenseTensor sampled_bbox_inside_weight = loc_score_tgtlbl_gt[4]; - - int loc_num = static_cast(sampled_loc_index.dims()[0]); - 
int score_num = static_cast(sampled_score_index.dims()[0]); - // unmap to all anchor - phi::DenseTensor sampled_loc_index_unmap, sampled_score_index_unmap; - sampled_loc_index_unmap.mutable_data({loc_num}, place); - sampled_score_index_unmap.mutable_data({score_num}, place); - Gather(inds_inside.data(), - 1, - sampled_loc_index.data(), - loc_num, - sampled_loc_index_unmap.data()); - Gather(inds_inside.data(), - 1, - sampled_score_index.data(), - score_num, - sampled_score_index_unmap.data()); - - // get target bbox deltas - phi::DenseTensor sampled_anchor, sampled_gt, sampled_tgt_bbox; - auto* sampled_anchor_data = - sampled_anchor.mutable_data({loc_num, 4}, place); - auto* sampled_gt_data = sampled_gt.mutable_data({loc_num, 4}, place); - Gather(anchor->data(), - 4, - sampled_loc_index_unmap.data(), - loc_num, - sampled_anchor_data); - Gather(ncrowd_gt_boxes.data(), - 4, - sampled_gt_index.data(), - loc_num, - sampled_gt_data); - sampled_tgt_bbox.mutable_data({loc_num, 4}, place); - BoxToDelta(loc_num, - sampled_anchor, - sampled_gt, - nullptr, - false, - &sampled_tgt_bbox); - - // Add anchor offset - int anchor_offset = static_cast(i * anchor_num); - auto sampled_loc_index_unmap_et = - framework::EigenTensor::From(sampled_loc_index_unmap); - sampled_loc_index_unmap_et = sampled_loc_index_unmap_et + anchor_offset; - auto sampled_score_index_unmap_et = - framework::EigenTensor::From(sampled_score_index_unmap); - sampled_score_index_unmap_et = - sampled_score_index_unmap_et + anchor_offset; - AppendRpns(loc_index, total_loc_num, &sampled_loc_index_unmap); - AppendRpns(score_index, total_score_num, &sampled_score_index_unmap); - AppendRpns(tgt_bbox, total_loc_num * 4, &sampled_tgt_bbox); - AppendRpns(tgt_lbl, total_score_num, &sampled_tgtlbl); - AppendRpns( - bbox_inside_weight, total_loc_num * 4, &sampled_bbox_inside_weight); - total_loc_num += loc_num; - - total_score_num += score_num; - lod0_loc.emplace_back(total_loc_num); - lod0_score.emplace_back(total_score_num); - } - - PADDLE_ENFORCE_LE( - total_loc_num, - max_num, - platform::errors::InvalidArgument( - "The number of sampled bboxes should not be greater than the " - "number of all anchor boxes(%d), but the number of sampled " - "bboxes is :%d.", - max_num, - total_loc_num)); - PADDLE_ENFORCE_LE( - total_score_num, - max_num, - platform::errors::InvalidArgument( - "The number of sampled scores should not be greater than the " - "number of all anchor boxes(%d), but the number of sampled " - "scores is :%d.", - max_num, - total_score_num)); - - lod_loc.emplace_back(lod0_loc); - loc_score.emplace_back(lod0_score); - loc_index->set_lod(lod_loc); - score_index->set_lod(loc_score); - tgt_bbox->set_lod(lod_loc); - tgt_lbl->set_lod(loc_score); - bbox_inside_weight->set_lod(lod_loc); - loc_index->Resize({total_loc_num}); - score_index->Resize({total_score_num}); - tgt_bbox->Resize({total_loc_num, 4}); - tgt_lbl->Resize({total_score_num, 1}); - bbox_inside_weight->Resize({total_loc_num, 4}); - } -}; - -class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Anchor", - "(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4]."); - AddInput("GtBoxes", - "(phi::DenseTensor) input ground-truth bbox with shape [K, 4]."); - AddInput("IsCrowd", - "(phi::DenseTensor) input which indicates ground-truth is crowd."); - AddInput("ImInfo", - "(phi::DenseTensor) input image information with shape [N, 3]. 
" - "N is the batch size, each image information includes height, " - "width and scale."); - AddAttr("rpn_batch_size_per_im", - "Total number of RPN examples per image.") - .SetDefault(256); - AddAttr( - "rpn_straddle_thresh", - "Remove RPN anchors that go outside the image by straddle_thresh " - "pixels, " - "Set to -1 or a large value, e.g. 100000, to disable pruning anchors."); - AddAttr( - "rpn_positive_overlap", - "Minimum overlap required between an anchor and ground-truth " - "box for the (anchor, gt box) pair to be a positive example.") - .SetDefault(0.7); - AddAttr( - "rpn_negative_overlap", - "Maximum overlap allowed between an anchor and ground-truth " - "box for the (anchor, gt box) pair to be a negative examples.") - .SetDefault(0.3); - AddAttr( - "rpn_fg_fraction", - "Target fraction of RoI minibatch that " - "is labeled foreground (i.e. class > 0), 0-th class is background.") - .SetDefault(0.25); - AddAttr("use_random", - "A flag indicating whether to use a ReservoirSampling. " - "NOTE: DO NOT set this flag to false in training. " - "Setting this flag to false is only useful in unittest.") - .SetDefault(true); - AddOutput( - "LocationIndex", - "(Tensor), The indexes of foreground anchors in all RPN anchors, the " - "shape of the LocationIndex is [F], F depends on the value of input " - "tensor and attributes."); - AddOutput( - "ScoreIndex", - "(Tensor), The indexes of foreground and background anchors in all " - "RPN anchors(The rest anchors are ignored). The shape of the " - "ScoreIndex is [F + B], F and B are sampled foreground and background " - " number."); - AddOutput("TargetBBox", - "(Tensor), The target bbox deltas with shape " - "[F, 4], F is the sampled foreground number."); - AddOutput( - "TargetLabel", - "(Tensor), The target labels of each anchor with shape " - "[F + B, 1], F and B are sampled foreground and background number."); - AddOutput("BBoxInsideWeight", - "(Tensor), The bbox inside weight with shape " - "[F, 4], F is the sampled foreground number."); - AddComment(R"DOC( -This operator can be, for a given set of ground truth bboxes and the -anchors, to assign classification and regression targets to each prediction. -The ScoreIndex and LocationIndex will be generated according to the anchor-groundtruth IOU. -The rest anchors would not contibute to the RPN training loss - -ScoreIndex is composed of foreground anchor indexes(positive labels) and -background anchor indexes(negative labels). LocationIndex is exactly same -as the foreground anchor indexes since we can not assign regression target to -the background anchors. - -The classification targets(TargetLabel) is a binary class label (of being -an object or not). Following the paper of Faster-RCNN, the positive labels -are two kinds of anchors: (i) the anchor/anchors with the highest IoU -overlap with a ground-truth box, or (ii) an anchor that has an IoU overlap -higher than rpn_positive_overlap(0.7) with any ground-truth box. Note that -a single ground-truth box may assign positive labels to multiple anchors. -A non-positive anchor is when its IoU ratio is lower than rpn_negative_overlap -(0.3) for all ground-truth boxes. Anchors that are neither positive nor -negative do not contribute to the training objective. 
- -)DOC"); - } -}; - -class RetinanetTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Anchor", - "(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4]."); - AddInput("GtBoxes", - "(phi::DenseTensor) input ground-truth bbox with shape [K, 4]."); - AddInput("GtLabels", - "(phi::DenseTensor) input ground-truth label with shape [K, 1]."); - AddInput("IsCrowd", - "(phi::DenseTensor) input which indicates ground-truth is crowd."); - AddInput("ImInfo", - "(phi::DenseTensor) input image information with shape [N, 3]. " - "N is the batch size, each image information includes height, " - "width and scale."); - AddAttr( - "positive_overlap", - "Minimum overlap required between an anchor and ground-truth " - "box for the (anchor, gt box) pair to be a positive example.") - .SetDefault(0.5); - AddAttr( - "negative_overlap", - "Maximum overlap allowed between an anchor and ground-truth " - "box for the (anchor, gt box) pair to be a negative examples.") - .SetDefault(0.4); - AddOutput( - "LocationIndex", - "(Tensor), The indexes of foreground anchors in all anchors, the " - "shape of the LocationIndex is [F], F depends on the value of input " - "tensor and attributes."); - AddOutput( - "ScoreIndex", - "(Tensor), The indexes of foreground and background anchors in all " - "RPN anchors(The rest anchors are ignored). The shape of the " - "ScoreIndex is [F + B], F and B are foreground and background " - " number."); - AddOutput("TargetBBox", - "(Tensor), The target bbox deltas with shape " - "[F, 4], F is the foreground number."); - AddOutput("TargetLabel", - "(Tensor), The target labels of each anchor with shape " - "[F + B, 1], F and B are foreground and background number."); - AddOutput("BBoxInsideWeight", - "(Tensor), The bbox inside weight with shape " - "[F, 4], F is the foreground number."); - AddOutput("ForegroundNumber", - "(Tensor), The foreground number. " - "[1, 1]."); - AddComment(R"DOC( - This layer can be, for given the Intersection-over-Union (IoU) overlap - between anchors and ground truth boxes, to assign classification and - regression targets to each anchor, these target labels are used for - train retinanet. - - Every anchor is assigned with a length C one-hot vector of - classification targets, and a 4-vector of box regression targets, - where C is the class number. The assignment rules are as followed: - - 1. Anchors are assigned to ground-truth boxes when: (i) it has the highest - IoU overlap with a ground-truth box, or (ii) it has an IoU overlap higher - than positive_overlap(0.5) with any ground-truth box. - - 2. Anchors are assigned to background when its IoU ratio is lower than - negative_overlap (0.4) for all ground-truth boxes. - - When an anchor is assigned with a ground-truth box which is the i-th category, - the i-th entry in its C vector of targets is set to 1 and all other entries - are set to 0. When an anchor is assigned with background, all entries are set - to 0. Anchors that are not assigned do not contribute to the training - objective. The regression targets are the encoded ground-truth boxes - associated with the assigned anchors. 
- -)DOC"); - } -}; - -class RetinanetTargetAssignOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("Anchor"), "Input", "Anchor", "retinanet_target_assign"); - OP_INOUT_CHECK(ctx->HasInput("GtBoxes"), - "Input", - "GtBoxes", - "retinanet_target_assign"); - OP_INOUT_CHECK(ctx->HasInput("GtLabels"), - "Input", - "GtLabels", - "retinanet_target_assign"); - OP_INOUT_CHECK(ctx->HasInput("IsCrowd"), - "Input", - "IsCrowd", - "retinanet_target_assign"); - OP_INOUT_CHECK( - ctx->HasInput("ImInfo"), "Input", "ImInfo", "retinanet_target_assign"); - OP_INOUT_CHECK(ctx->HasOutput("LocationIndex"), - "Output", - "LocationIndex", - "retinanet_target_assign"); - OP_INOUT_CHECK(ctx->HasOutput("ScoreIndex"), - "Output", - "ScoreIndex", - "retinanet_target_assign"); - OP_INOUT_CHECK(ctx->HasOutput("TargetLabel"), - "Output", - "TargetLabel", - "retinanet_target_assign"); - OP_INOUT_CHECK(ctx->HasOutput("TargetBBox"), - "Output", - "TargetBBox", - "retinanet_target_assign"); - OP_INOUT_CHECK(ctx->HasOutput("BBoxInsideWeight"), - "Output", - "BBoxInsideWeight", - "retinanet_target_assign"); - OP_INOUT_CHECK(ctx->HasOutput("ForegroundNumber"), - "Output", - "ForegroundNumber", - "retinanet_target_assign"); - - auto anchor_dims = ctx->GetInputDim("Anchor"); - auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); - auto gt_labels_dims = ctx->GetInputDim("GtLabels"); - auto im_info_dims = ctx->GetInputDim("ImInfo"); - - PADDLE_ENFORCE_EQ( - anchor_dims.size(), - 2, - platform::errors::InvalidArgument( - "The rank of Input(Anchor) should be 2, but received Anchor " - "rank is :%d, Anchor shape is:[%s].", - anchor_dims.size(), - anchor_dims)); - PADDLE_ENFORCE_EQ( - gt_boxes_dims.size(), - 2, - platform::errors::InvalidArgument( - "The rank of Input(GtBoxes) should be 2, but received GtBoxes " - "rank is :%d, GtBoxes shape is:[%s].", - gt_boxes_dims.size(), - gt_boxes_dims)); - PADDLE_ENFORCE_EQ( - gt_labels_dims.size(), - 2, - platform::errors::InvalidArgument( - "The rank of Input(GtLabels) should be 2, but received GtLabels " - "rank is :%d, GtLabels shape is:[%s].", - gt_labels_dims.size(), - gt_labels_dims)); - PADDLE_ENFORCE_EQ( - im_info_dims.size(), - 2, - platform::errors::InvalidArgument( - "The rank of Input(ImInfo) should be 2, but received ImInfo " - "rank is :%d, ImInfo shape is:[%s].", - im_info_dims.size(), - im_info_dims)); - - ctx->SetOutputDim("LocationIndex", {gt_labels_dims[0]}); - ctx->SetOutputDim("ScoreIndex", {gt_labels_dims[0]}); - ctx->SetOutputDim("TargetBBox", {gt_labels_dims[0], 4}); - ctx->SetOutputDim("TargetLabel", {gt_labels_dims[0], 1}); - ctx->SetOutputDim("BBoxInsideWeight", {gt_labels_dims[0], 4}); - ctx->SetOutputDim("ForegroundNumber", {gt_labels_dims[0], 1}); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey( - OperatorWithKernel::IndicateVarDataType(ctx, "Anchor"), - platform::CPUPlace()); - } -}; - -template -std::vector FilterCrowdGtBoxLabel( - const phi::CPUContext& context, - phi::DenseTensor* gt_boxes, - phi::DenseTensor* gt_labels, - phi::DenseTensor* is_crowd) { - int gt_num = static_cast(gt_boxes->dims()[0]); - std::vector not_crowd_inds; - auto* is_crowd_data = is_crowd->data(); - for (int i = 0; i < gt_num; ++i) { - if (is_crowd_data[i] == 0) { - not_crowd_inds.emplace_back(i); - } - } - int ncrowd_num = 
static_cast(not_crowd_inds.size()); - phi::DenseTensor ncrowd_gt_boxes, ncrowd_gt_labels; - T* ncrowd_gt_boxes_data = - ncrowd_gt_boxes.mutable_data({ncrowd_num, 4}, context.GetPlace()); - int* ncrowd_gt_labels_data = - ncrowd_gt_labels.mutable_data({ncrowd_num, 1}, context.GetPlace()); - Gather(gt_boxes->data(), - 4, - not_crowd_inds.data(), - ncrowd_num, - ncrowd_gt_boxes_data); - Gather(gt_labels->data(), - 1, - not_crowd_inds.data(), - ncrowd_num, - ncrowd_gt_labels_data); - std::vector res; - res.emplace_back(ncrowd_gt_boxes); - res.emplace_back(ncrowd_gt_labels); - return res; -} - -template -std::vector GetAllFgBgGt( - const phi::CPUContext& ctx, - const phi::DenseTensor& anchor_by_gt_overlap, - const phi::DenseTensor& ncrowd_gt_labels, - const float positive_overlap, - const float negative_overlap, - std::minstd_rand engine) { - auto* anchor_by_gt_overlap_data = anchor_by_gt_overlap.data(); - int anchor_num = static_cast(anchor_by_gt_overlap.dims()[0]); - int gt_num = static_cast(anchor_by_gt_overlap.dims()[1]); - - std::vector fg_inds; - std::vector bg_inds; - std::vector gt_inds; - std::vector tgt_lbl; - std::vector fg_fake; - std::vector bbox_inside_weight; - // Calculate the max IoU between anchors and gt boxes - // Map from anchor to gt box that has highest overlap - auto place = ctx.GetPlace(); - phi::DenseTensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max; - anchor_to_gt_max.mutable_data({anchor_num}, place); - int* argmax = anchor_to_gt_argmax.mutable_data({anchor_num}, place); - gt_to_anchor_max.mutable_data({gt_num}, place); - - auto anchor_by_gt_overlap_et = - framework::EigenMatrix::From(anchor_by_gt_overlap); - auto anchor_to_gt_max_et = - framework::EigenVector::Flatten(anchor_to_gt_max); - auto gt_to_anchor_max_et = - framework::EigenVector::Flatten(gt_to_anchor_max); - auto anchor_to_gt_argmax_et = - framework::EigenVector::Flatten(anchor_to_gt_argmax); - anchor_to_gt_max_et = - anchor_by_gt_overlap_et.maximum(Eigen::DSizes(1)); - anchor_to_gt_argmax_et = - anchor_by_gt_overlap_et.argmax(1).template cast(); - gt_to_anchor_max_et = - anchor_by_gt_overlap_et.maximum(Eigen::DSizes(0)); - - ScoreAssign(anchor_by_gt_overlap_data, - anchor_to_gt_max, - gt_to_anchor_max, - -1, - -1, - positive_overlap, - negative_overlap, - &fg_inds, - &bg_inds, - &tgt_lbl, - &fg_fake, - &bbox_inside_weight, - engine, - false); - const int* gt_labels_data = ncrowd_gt_labels.data(); - int64_t fg_num = static_cast(fg_inds.size()); - for (int64_t i = 0; i < fg_num; ++i) { - int gt_idx = argmax[fg_inds[i]]; - tgt_lbl[i] = gt_labels_data[gt_idx]; - } - - int bg_num = static_cast(bg_inds.size()); - int fg_fake_num = static_cast(fg_fake.size()); - gt_inds.reserve(fg_fake_num); - for (int i = 0; i < fg_fake_num; ++i) { - gt_inds.emplace_back(argmax[fg_fake[i]]); - } - - phi::DenseTensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t, - bbox_inside_weight_t; - phi::DenseTensor fg_num_t; - int* loc_index_data = loc_index_t.mutable_data({fg_fake_num}, place); - int* score_index_data = - score_index_t.mutable_data({fg_num + bg_num}, place); - int* tgt_lbl_data = tgt_lbl_t.mutable_data({fg_num + bg_num}, place); - int* gt_inds_data = gt_inds_t.mutable_data({fg_fake_num}, place); - int* fg_num_data = fg_num_t.mutable_data({1}, place); - T* bbox_inside_weight_data = - bbox_inside_weight_t.mutable_data({fg_fake_num, 4}, place); - std::copy(fg_fake.begin(), fg_fake.end(), loc_index_data); - std::copy(fg_inds.begin(), fg_inds.end(), score_index_data); - std::copy(bg_inds.begin(), 
bg_inds.end(), score_index_data + fg_num); - std::copy(tgt_lbl.begin(), tgt_lbl.end(), tgt_lbl_data); - std::copy(gt_inds.begin(), gt_inds.end(), gt_inds_data); - std::copy(bbox_inside_weight.begin(), - bbox_inside_weight.end(), - bbox_inside_weight_data); - fg_num_data[0] = static_cast(fg_fake.size()) + 1; - std::vector loc_score_tgtlbl_gt; - loc_score_tgtlbl_gt.emplace_back(loc_index_t); - loc_score_tgtlbl_gt.emplace_back(score_index_t); - loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t); - loc_score_tgtlbl_gt.emplace_back(gt_inds_t); - loc_score_tgtlbl_gt.emplace_back(bbox_inside_weight_t); - loc_score_tgtlbl_gt.emplace_back(fg_num_t); - - return loc_score_tgtlbl_gt; -} - -template -class RetinanetTargetAssignKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* anchor = context.Input("Anchor"); // (H*W*A) * 4 - auto* gt_boxes = context.Input("GtBoxes"); - auto* gt_labels = context.Input("GtLabels"); - auto* is_crowd = context.Input("IsCrowd"); - auto* im_info = context.Input("ImInfo"); - - auto* loc_index = context.Output("LocationIndex"); - auto* score_index = context.Output("ScoreIndex"); - auto* tgt_bbox = context.Output("TargetBBox"); - auto* tgt_lbl = context.Output("TargetLabel"); - auto* bbox_inside_weight = - context.Output("BBoxInsideWeight"); - auto* fg_num = context.Output("ForegroundNumber"); - - PADDLE_ENFORCE_EQ( - gt_boxes->lod().size(), - 1UL, - platform::errors::InvalidArgument( - "The LoD level of Input(GtBoxes) should be 1, but received GtBoxes " - "LoD level is :%d.", - gt_boxes->lod().size())); - PADDLE_ENFORCE_EQ( - gt_labels->lod().size(), - 1UL, - platform::errors::InvalidArgument("The LoD level of Input(GtLabels) " - "should be 1, but received GtLabels " - "LoD level is :%d.", - gt_labels->lod().size())); - PADDLE_ENFORCE_EQ( - is_crowd->lod().size(), - 1UL, - platform::errors::InvalidArgument( - "The LoD level of Input(IsCrowd) should be 1, but received IsCrowd " - "LoD level is :%d.", - is_crowd->lod().size())); - - int64_t anchor_num = static_cast(anchor->dims()[0]); - int64_t batch_num = static_cast(gt_boxes->lod().back().size() - 1); - - float positive_overlap = context.Attr("positive_overlap"); - float negative_overlap = context.Attr("negative_overlap"); - - int64_t max_num = batch_num * anchor_num; - auto place = context.GetPlace(); - - loc_index->mutable_data({max_num}, place); - score_index->mutable_data({max_num}, place); - tgt_bbox->mutable_data({max_num, 4}, place); - tgt_lbl->mutable_data({max_num, 1}, place); - bbox_inside_weight->mutable_data({max_num, 4}, place); - fg_num->mutable_data({batch_num, 1}, place); - auto& dev_ctx = context.device_context(); - - std::random_device rnd; - std::minstd_rand engine; - int seed = static_cast(rnd()); - engine.seed(seed); - - framework::LoD lod_loc, loc_score, lod_fg; - std::vector lod0_loc(1, 0); - std::vector lod0_score(1, 0); - std::vector lod0_fg(1, 0); - - int total_loc_num = 0; - int total_score_num = 0; - int total_fg_num = 0; - auto gt_boxes_lod = gt_boxes->lod().back(); - auto gt_labels_lod = gt_labels->lod().back(); - auto is_crowd_lod = is_crowd->lod().back(); - for (int i = 0; i < batch_num; ++i) { - phi::DenseTensor gt_boxes_slice = - gt_boxes->Slice(static_cast(gt_boxes_lod[i]), - static_cast(gt_boxes_lod[i + 1])); - phi::DenseTensor gt_labels_slice = - gt_labels->Slice(static_cast(gt_labels_lod[i]), - static_cast(gt_labels_lod[i + 1])); - phi::DenseTensor is_crowd_slice = - is_crowd->Slice(static_cast(is_crowd_lod[i]), - 
static_cast(is_crowd_lod[i + 1])); - phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1); - auto* im_info_data = im_info_slice.data(); - auto im_height = im_info_data[0]; - auto im_width = im_info_data[1]; - auto im_scale = im_info_data[2]; - - // Filter straddle anchor - std::vector filter_output = - FilterStraddleAnchor(dev_ctx, anchor, -1, im_height, im_width); - phi::DenseTensor inds_inside = filter_output[0]; - phi::DenseTensor inside_anchor = filter_output[1]; - - // Filter crowd gt - std::vector ncrowd_output = FilterCrowdGtBoxLabel( - dev_ctx, >_boxes_slice, >_labels_slice, &is_crowd_slice); - phi::DenseTensor ncrowd_gt_boxes = ncrowd_output[0]; - phi::DenseTensor ncrowd_gt_labels = ncrowd_output[1]; - - auto ncrowd_gt_boxes_et = - framework::EigenTensor::From(ncrowd_gt_boxes); - ncrowd_gt_boxes_et = ncrowd_gt_boxes_et * im_scale; - - phi::DenseTensor anchor_by_gt_overlap; - anchor_by_gt_overlap.mutable_data( - {inside_anchor.dims()[0], ncrowd_gt_boxes.dims()[0]}, place); - BboxOverlaps(inside_anchor, ncrowd_gt_boxes, &anchor_by_gt_overlap); - - auto loc_score_tgtlbl_gt = GetAllFgBgGt(dev_ctx, - anchor_by_gt_overlap, - ncrowd_gt_labels, - positive_overlap, - negative_overlap, - engine); - - phi::DenseTensor sampled_loc_index = loc_score_tgtlbl_gt[0]; - phi::DenseTensor sampled_score_index = loc_score_tgtlbl_gt[1]; - phi::DenseTensor sampled_tgtlbl = loc_score_tgtlbl_gt[2]; - phi::DenseTensor sampled_gt_index = loc_score_tgtlbl_gt[3]; - phi::DenseTensor sampled_bbox_inside_weight = loc_score_tgtlbl_gt[4]; - phi::DenseTensor sampled_fg_num = loc_score_tgtlbl_gt[5]; - - int loc_num = static_cast(sampled_loc_index.dims()[0]); - int score_num = static_cast(sampled_score_index.dims()[0]); - // unmap to all anchor - phi::DenseTensor sampled_loc_index_unmap, sampled_score_index_unmap; - sampled_loc_index_unmap.mutable_data({loc_num}, place); - sampled_score_index_unmap.mutable_data({score_num}, place); - Gather(inds_inside.data(), - 1, - sampled_loc_index.data(), - loc_num, - sampled_loc_index_unmap.data()); - Gather(inds_inside.data(), - 1, - sampled_score_index.data(), - score_num, - sampled_score_index_unmap.data()); - - // get target bbox deltas - phi::DenseTensor sampled_anchor, sampled_gt, sampled_tgt_bbox; - auto* sampled_anchor_data = - sampled_anchor.mutable_data({loc_num, 4}, place); - auto* sampled_gt_data = sampled_gt.mutable_data({loc_num, 4}, place); - Gather(anchor->data(), - 4, - sampled_loc_index_unmap.data(), - loc_num, - sampled_anchor_data); - Gather(ncrowd_gt_boxes.data(), - 4, - sampled_gt_index.data(), - loc_num, - sampled_gt_data); - sampled_tgt_bbox.mutable_data({loc_num, 4}, place); - BoxToDelta(loc_num, - sampled_anchor, - sampled_gt, - nullptr, - false, - &sampled_tgt_bbox); - - // Add anchor offset - int anchor_offset = static_cast(i * anchor_num); - auto sampled_loc_index_unmap_et = - framework::EigenTensor::From(sampled_loc_index_unmap); - sampled_loc_index_unmap_et = sampled_loc_index_unmap_et + anchor_offset; - auto sampled_score_index_unmap_et = - framework::EigenTensor::From(sampled_score_index_unmap); - sampled_score_index_unmap_et = - sampled_score_index_unmap_et + anchor_offset; - AppendRpns(loc_index, total_loc_num, &sampled_loc_index_unmap); - AppendRpns(score_index, total_score_num, &sampled_score_index_unmap); - AppendRpns(tgt_bbox, total_loc_num * 4, &sampled_tgt_bbox); - AppendRpns(tgt_lbl, total_score_num, &sampled_tgtlbl); - AppendRpns( - bbox_inside_weight, total_loc_num * 4, &sampled_bbox_inside_weight); - AppendRpns(fg_num, 
total_fg_num, &sampled_fg_num); - - total_loc_num += loc_num; - total_score_num += score_num; - total_fg_num += 1; - lod0_loc.emplace_back(total_loc_num); - lod0_score.emplace_back(total_score_num); - lod0_fg.emplace_back(total_fg_num); - } - - PADDLE_ENFORCE_LE( - total_loc_num, - max_num, - platform::errors::InvalidArgument( - "The number of sampled bboxes should not be greater than the " - "number of all anchor boxes(%d), but the number of sampled " - "bboxes is :%d.", - max_num, - total_loc_num)); - PADDLE_ENFORCE_LE( - total_score_num, - max_num, - platform::errors::InvalidArgument( - "The number of sampled scores should not be greater than the " - "number of all anchor boxes(%d), but the number of sampled " - "scores is :%d.", - max_num, - total_score_num)); - PADDLE_ENFORCE_LE( - total_fg_num, - batch_num, - platform::errors::InvalidArgument( - "The number of foreground numbers should not be greater than the " - "batch size(%d), but the number of foreground numbers is :%d.", - batch_num, - total_fg_num)); - - lod_loc.emplace_back(lod0_loc); - loc_score.emplace_back(lod0_score); - lod_fg.emplace_back(lod0_fg); - loc_index->set_lod(lod_loc); - score_index->set_lod(loc_score); - tgt_bbox->set_lod(lod_loc); - tgt_lbl->set_lod(loc_score); - bbox_inside_weight->set_lod(lod_loc); - fg_num->set_lod(lod_fg); - loc_index->Resize({total_loc_num}); - score_index->Resize({total_score_num}); - tgt_bbox->Resize({total_loc_num, 4}); - tgt_lbl->Resize({total_score_num, 1}); - bbox_inside_weight->Resize({total_loc_num, 4}); - fg_num->Resize({total_fg_num, 1}); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - rpn_target_assign, - ops::RpnTargetAssignOp, - ops::RpnTargetAssignOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -PD_REGISTER_STRUCT_KERNEL(rpn_target_assign, - CPU, - ALL_LAYOUT, - ops::RpnTargetAssignKernel, - float, - double) {} -REGISTER_OPERATOR( - retinanet_target_assign, - ops::RetinanetTargetAssignOp, - ops::RetinanetTargetAssignOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -PD_REGISTER_STRUCT_KERNEL(retinanet_target_assign, - CPU, - ALL_LAYOUT, - ops::RetinanetTargetAssignKernel, - float, - double) {} diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc index a0879337f5ae7..cee37d49eb69b 100644 --- a/paddle/fluid/operators/detection_map_op.cc +++ b/paddle/fluid/operators/detection_map_op.cc @@ -45,28 +45,28 @@ class DetectionMAPOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( det_dims.size(), 2UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(DetectRes) ndim must be 2, the shape is [N, 6]," "but received the ndim is %d", det_dims.size())); PADDLE_ENFORCE_EQ( det_dims[1], 6UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape is of Input(DetectRes) [N, 6], but received" " shape is [N, %d]", det_dims[1])); auto label_dims = ctx->GetInputDim("Label"); PADDLE_ENFORCE_EQ(label_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The ndim of Input(Label) must be 2, but received %d", label_dims.size())); if (ctx->IsRuntime() || label_dims[1] > 0) { PADDLE_ENFORCE_EQ( (label_dims[1] == 6 || label_dims[1] == 5), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of Input(Label) is [N, 6] or [N, 5], but received " "[N, %d]", label_dims[1])); @@ -75,12 +75,12 
@@ class DetectionMAPOp : public framework::OperatorWithKernel {
   if (ctx->HasInput("PosCount")) {
     PADDLE_ENFORCE(
         ctx->HasInput("TruePos"),
-        platform::errors::InvalidArgument(
+        phi::errors::InvalidArgument(
             "Input(TruePos) of DetectionMAPOp should not be null when "
             "Input(PosCount) is not null."));
     PADDLE_ENFORCE(
         ctx->HasInput("FalsePos"),
-        platform::errors::InvalidArgument(
+        phi::errors::InvalidArgument(
             "Input(FalsePos) of DetectionMAPOp should not be null when "
             "Input(PosCount) is not null."));
   }
@@ -197,7 +197,7 @@ class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker {
           PADDLE_ENFORCE_NE(
               GetAPType(ap_type),
               APType::kNone,
-              platform::errors::InvalidArgument(
+              phi::errors::InvalidArgument(
                   "The ap_type should be 'integral' or '11point'."));
         });
   AddComment(R"DOC(
diff --git a/paddle/fluid/operators/detection_map_op.h b/paddle/fluid/operators/detection_map_op.h
index ccf0834968793..24fea9c431c63 100644
--- a/paddle/fluid/operators/detection_map_op.h
+++ b/paddle/fluid/operators/detection_map_op.h
@@ -82,12 +82,12 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_EQ(
         label_lod.size(),
         1UL,
-        platform::errors::InvalidArgument("Only supports a LoDTensor with "
-                                          "lod_level 1 for label, but received %d.",
-                                          label_lod.size()));
+        phi::errors::InvalidArgument("Only supports a LoDTensor with "
+                                     "lod_level 1 for label, but received %d.",
+                                     label_lod.size()));
     PADDLE_ENFORCE_EQ(label_lod[0].size(),
                       detect_lod[0].size(),
-                      platform::errors::InvalidArgument(
+                      phi::errors::InvalidArgument(
                           "The batch_size of input(Label) and input(Detection) "
                           "must be the same, but received %d:%d",
                           label_lod[0].size(),
@@ -212,7 +212,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_EQ(
         input_label.dims()[1],
         5,
-        platform::errors::InvalidArgument(
+        phi::errors::InvalidArgument(
             "The input label width"
             " must be 5, but received %d, please check your input data",
             input_label.dims()[1]));
@@ -504,7 +504,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
       mAP += average_precisions;
       ++count;
     } else {
-      PADDLE_THROW(platform::errors::Unimplemented(
+      PADDLE_THROW(phi::errors::Unimplemented(
           "Unknown ap version %s. 
Now only supports integral and 11point.",
          ap_type));
    }
diff --git a/paddle/fluid/operators/dlnne/dlnne_engine_op.cc b/paddle/fluid/operators/dlnne/dlnne_engine_op.cc
index 86508bfbf2720..a09e8aaec8156 100644
--- a/paddle/fluid/operators/dlnne/dlnne_engine_op.cc
+++ b/paddle/fluid/operators/dlnne/dlnne_engine_op.cc
@@ -44,8 +44,8 @@ std::string ConvertType(phi::DataType type) {
     }
     default: {
       PADDLE_THROW(
-          platform::errors::Fatal("The DLNNE Calibration only supports "
-                                  "float/float16/int32_t/int64_t input."));
+          phi::errors::Fatal("The DLNNE Calibration only supports "
+                             "float/float16/int32_t/int64_t input."));
     }
   }
 }
@@ -66,8 +66,8 @@ int GetDataByte(phi::DataType type) {
     }
     default: {
       PADDLE_THROW(
-          platform::errors::Fatal("The DLNNE Calibration only supports "
-                                  "float/float16/int32_t/int64_t input."));
+          phi::errors::Fatal("The DLNNE Calibration only supports "
+                             "float/float16/int32_t/int64_t input."));
     }
   }
 }
@@ -93,7 +93,7 @@ void ConvertPaddle2Onnx(std::string onnx_file_name,
     PADDLE_ENFORCE_EQ(
         convert_flag,
         0,
-        platform::errors::Unavailable("Convert paddle to onnx failed"));
+        phi::errors::Unavailable("Convert paddle to onnx failed"));
   }
 }
@@ -108,10 +108,9 @@ void QuantizeOnnx(std::string onnx_file_name,
                 << " --output-model " << rlym_file_name;
     LOG(INFO) << convert_cmd.str();
     int convert_flag = system(convert_cmd.str().c_str());
-    PADDLE_ENFORCE_EQ(
-        convert_flag,
-        0,
-        platform::errors::Unavailable("Convert onnx to rlym failed"));
+    PADDLE_ENFORCE_EQ(convert_flag,
+                      0,
+                      phi::errors::Unavailable("Convert onnx to rlym failed"));
   }
   if (!FileExists(quantized_rlym_file_name.c_str())) {
@@ -121,9 +120,8 @@
                  << dataset_plugin_path << " " << rlym_file_name;
     LOG(INFO) << quantize_cmd.str();
     int quantize_flag = system(quantize_cmd.str().c_str());
-    PADDLE_ENFORCE_EQ(quantize_flag,
-                      0,
-                      platform::errors::Unavailable("quantize model failed"));
+    PADDLE_ENFORCE_EQ(
+        quantize_flag, 0, phi::errors::Unavailable("quantize model failed"));
   }
 }
diff --git a/paddle/fluid/operators/dlnne/dlnne_engine_op.h b/paddle/fluid/operators/dlnne/dlnne_engine_op.h
index d0063c51512e3..363e21545c9ab 100644
--- a/paddle/fluid/operators/dlnne/dlnne_engine_op.h
+++ b/paddle/fluid/operators/dlnne/dlnne_engine_op.h
@@ -37,7 +37,7 @@
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/utils/io_utils.h"
-#include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/common/float16.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
@@ -131,7 +131,7 @@ static phi::DataType DLNNE2FluidDataType(dl::nne::DataType type) {
     case dl::nne::DataType::kBOOL:
       return phi::DataType::BOOL;
     default:
-      PADDLE_THROW(platform::errors::InvalidArgument(
+      PADDLE_THROW(phi::errors::InvalidArgument(
           "Unknown fluid datatype in Fluid op converter"));
       return phi::DataType::FLOAT32;
   }
@@ -316,10 +316,9 @@ class DlnneEngineOp : public framework::OperatorBase {
     }
     builder = dl::nne::CreateInferBuilder();
-    PADDLE_ENFORCE_NE(
-        builder,
-        nullptr,
-        platform::errors::Unavailable("nne create builder failed"));
+    PADDLE_ENFORCE_NE(builder,
+                      nullptr,
+                      phi::errors::Unavailable("nne create builder failed"));
     dl::nne::BuilderConfig builder_cfg;
     builder_cfg.max_batch_size = max_batch_size_;
     builder_cfg.ws_mode = weight_share_map[weight_share_mode_];
@@ -327,10 +326,9 @@ class DlnneEngineOp : public framework::OperatorBase {
     network = builder->CreateNetwork();
     parser =
dl::nne::CreateParser(); - PADDLE_ENFORCE_NE( - parser, - nullptr, - platform::errors::Unavailable("nne create parser failed")); + PADDLE_ENFORCE_NE(parser, + nullptr, + phi::errors::Unavailable("nne create parser failed")); if (dlnne_log_flag_) { LOG(INFO) << "set output for dlnne"; } @@ -402,7 +400,7 @@ class DlnneEngineOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ( input_names_.empty(), false, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Dlnne engine needs at least one input, but no input is found. " "Please check if you set the input correctly.")); @@ -440,7 +438,7 @@ class DlnneEngineOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ( first_batch, batch, - platform::errors::Unavailable( + phi::errors::Unavailable( "compute infer_batches is different from each other")); } infer_batch = first_batch; @@ -474,13 +472,13 @@ class DlnneEngineOp : public framework::OperatorBase { data_bytes = 4; dtype = 2; } else if (type == phi::DataType::FLOAT16) { - buffer = static_cast(t.data()); + buffer = static_cast(t.data()); data_bytes = 2; dtype = 3; } else { PADDLE_THROW( - platform::errors::Fatal("The DLNNE Engine OP only support " - "float/int32_t/int64_t/float16 input.")); + phi::errors::Fatal("The DLNNE Engine OP only support " + "float/int32_t/int64_t/float16 input.")); } input_buffers[bind_index] = buffer; @@ -555,7 +553,7 @@ class DlnneEngineOp : public framework::OperatorBase { auto *fluid_v = scope.FindVar(y); PADDLE_ENFORCE_NOT_NULL( fluid_v, - platform::errors::NotFound( + phi::errors::NotFound( "Output variable %s is not found in DLNNE subgraph.", y)); auto *fluid_t = fluid_v->GetMutable(); diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index 01df430f52161..d538164977277 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -71,7 +71,7 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { .AddCustomChecker([](const float& drop_p) { PADDLE_ENFORCE_EQ(drop_p >= 0.0f && drop_p <= 1.0f, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'dropout_prob' must be between 0.0 and 1.0.")); }) .SupportTensor(); @@ -100,7 +100,7 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { PADDLE_ENFORCE_EQ( type == "downgrade_in_infer" || type == "upscale_in_train", true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "dropout_implementation can only be downgrade_in_infer or " "upscale_in_train")); }); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index 191890865fb89..4029be65a00d6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -107,6 +107,7 @@ class ElementwiseDivDoubleGradMaker : public framework::SingleGradOpMaker { op->SetType("elementwise_div_grad_grad"); op->SetInput("Y", this->Input("Y")); op->SetInput("Out", this->Input("Out")); + op->SetInput("Out@GRAD", this->Input(framework::GradVarName("Out"))); op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); op->SetInput("DDY", this->OutputGrad(framework::GradVarName("Y"))); op->SetInput("DX", this->Output(framework::GradVarName("X"))); diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index d835caedbf3c8..00a5ca7a39d0e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ 
b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -26,7 +26,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #ifdef PADDLE_WITH_DNNL
-#include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/platform/onednn_helper.h"
 #endif
 namespace paddle {
@@ -44,7 +44,7 @@ class ElementwiseOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(
         ctx->GetInputsVarType("Y").front(),
         framework::proto::VarType::LOD_TENSOR,
-        platform::errors::InvalidArgument(
+        phi::errors::InvalidArgument(
             "The input var's type should be phi::DenseTensor, but the "
             "received is %s [%s].",
            ctx->GetInputsVarType("Y").front(),
@@ -55,7 +55,7 @@ class ElementwiseOp : public framework::OperatorWithKernel {
       PADDLE_ENFORCE_EQ(
           ctx->GetInputDim("Y").size(),
           1u,
-          platform::errors::InvalidArgument(
+          phi::errors::InvalidArgument(
              "For elementwise_op, if X is Sparse(VarType.SELECTED_ROWS"
              "), Y must be scalar, the size of Y should be 1. "
              "But received the size of Y = %s.",
@@ -63,14 +63,14 @@ class ElementwiseOp : public framework::OperatorWithKernel {
       PADDLE_ENFORCE_EQ(
           ctx->GetInputDim("Y")[0],
           1,
-          platform::errors::InvalidArgument(
+          phi::errors::InvalidArgument(
              "For elementwise_op, if X is Sparse(VarType.SELECTED_ROWS"
              "), Y must be scalar, the first dimension of Y should be 1. "
              "But received the first dimension of Y = %s.",
              ctx->GetInputDim("Y")[0]));
     } else if (ctx->GetInputsVarType("X").front() !=
                framework::proto::VarType::LOD_TENSOR) {
-      PADDLE_THROW(platform::errors::InvalidArgument(
+      PADDLE_THROW(phi::errors::InvalidArgument(
          "Input X's type[%s] is not supported by elementwise_op. Please set "
          "its type to LOD_TENSOR.",
          ctx->GetInputsVarType("X").front()));
@@ -87,7 +87,7 @@ class ElementwiseOp : public framework::OperatorWithKernel {
     if (x_dims.size() == y_dims.size()) {
       PADDLE_ENFORCE_EQ((axis == -1) || (axis == 0),
                         true,
-                        platform::errors::InvalidArgument(
+                        phi::errors::InvalidArgument(
                            "axis should be -1 or 0 while the dimension of "
                            "tensor X (%s) is equal to the dimension of "
                            "tensor Y (%s), but received axis: %s",
@@ -97,7 +97,7 @@
     }
     PADDLE_ENFORCE_EQ((axis >= (-1 * max_dim)) && (axis < max_dim),
                       true,
-                      platform::errors::InvalidArgument(
+                      phi::errors::InvalidArgument(
                          "The axis range must be [%s, %s), but axis is %s. "
                          "Please set the axis again.",
                          -1 * max_dim,
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h
index afa2df659c42a..3d0fe2ab399bc 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h
@@ -76,7 +76,7 @@ int PackTensorsIntoVector(const framework::ExecutionContext &ctx,
   auto x_var = ctx.InputVar("X");
   PADDLE_ENFORCE_NOT_NULL(
       x_var,
-      platform::errors::InvalidArgument(
+      phi::errors::InvalidArgument(
          "Unable to get input Variable X, Variable name is %s.\n",
          ctx.InputName("X")));
   auto *y = ctx.Input<phi::DenseTensor>("Y");
@@ -89,13 +89,13 @@ int PackTensorsIntoVector(const framework::ExecutionContext &ctx,
   } else if (x_var->IsType<phi::SelectedRows>()) {
     PADDLE_ENFORCE_EQ(y->dims().size() == 1 && y->dims()[0] == 1,
                       true,
-                      platform::errors::InvalidArgument(
+                      phi::errors::InvalidArgument(
                          "For elementwise_op, if X is Sparse, Y must be "
                          "scalar. 
But received the size of Y = %d.",
                          y->dims().size()));
     PADDLE_ENFORCE_NOT_NULL(
         x_for_selectedrows,
-        platform::errors::InvalidArgument(
+        phi::errors::InvalidArgument(
            "The parameter x_for_selectedrows is expected to "
            "be valid, once input variable X's class type is "
            "SelectedRows.\n"));
@@ -110,7 +110,7 @@ int PackTensorsIntoVector(const framework::ExecutionContext &ctx,
     z = ctx.Output<phi::SelectedRows>("Out")->mutable_value();
     ins->emplace_back(x_for_selectedrows);
   } else {
-    PADDLE_THROW(platform::errors::InvalidArgument(
+    PADDLE_THROW(phi::errors::InvalidArgument(
        "X's type[%s] is not supported by elementwise_op. X's type should be "
        "phi::DenseTensor or SelectedRows.",
        framework::ToTypeName(x_var->Type())));
@@ -1403,7 +1403,7 @@ void FusedElemwiseAndActGradComputeEx(const framework::ExecutionContext &ctx,
   if (UseIntermediateOut) {
     PADDLE_ENFORCE_NOT_NULL(
         intermediate_out,
-        platform::errors::InvalidArgument("Intermediate out is null pointer."));
+        phi::errors::InvalidArgument("Intermediate out is null pointer."));
   }
   if (x_dim == y_dim) {
     FusedElemwiseAndActGradComputeNoBroadcastIsType(), true,
-                    platform::errors::InvalidArgument("XPU only supports phi::DenseTensor, "
-                                                      "Input(X) is not phi::DenseTensor"));
+                    phi::errors::InvalidArgument("XPU only supports phi::DenseTensor, "
+                                                 "Input(X) is not phi::DenseTensor"));
   auto x = x_var->Get<phi::DenseTensor>();
   auto* y = ctx.Input<phi::DenseTensor>("Y");
diff --git a/paddle/fluid/operators/enqueue_op.cc b/paddle/fluid/operators/enqueue_op.cc
deleted file mode 100644
index c8279719789c4..0000000000000
--- a/paddle/fluid/operators/enqueue_op.cc
+++ /dev/null
@@ -1,90 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include - -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/var_type_traits.h" -#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" - -namespace paddle { -namespace framework { -class OpDesc; -template -class EmptyGradOpMaker; -} // namespace framework -namespace imperative { -class OpBase; -} // namespace imperative -} // namespace paddle - -using LoDTensorBlockingQueueHolder = - paddle::operators::reader::LoDTensorBlockingQueueHolder; - -namespace paddle { -namespace operators { - -class EnqueueOp : public framework::OperatorBase { - public: - EnqueueOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - const std::string& queue_name = Attr("queue_name"); - auto* queue_holder_var = scope.FindVar(queue_name); - PADDLE_ENFORCE_NOT_NULL( - queue_holder_var, - platform::errors::NotFound( - "No LoDTensorBlockingQueueHolder variable with name %s found.", - queue_name)); - const std::string& var_name = Input("X"); - auto* in_var = scope.FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL(in_var, - platform::errors::NotFound( - "No variable with name %s found.", var_name)); - auto* in_tensor = in_var->GetMutable(); - auto* queue_holder = - queue_holder_var->template GetMutable(); - - paddle::framework::LoDTensorArray lod_tensor_vec; - lod_tensor_vec.emplace_back(*in_tensor); - queue_holder->GetQueue()->Push(lod_tensor_vec); - } -}; - -class EnqueueOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "`lod_tensor` to enqueue"); - AddAttr("queue_name", - "Name of the `LoDTensorBlockingQueueHolder` variable"); - AddComment(R"DOC( - Enqueue operator. - )DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = ::paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(enqueue, ops::EnqueueOp, ops::EnqueueOpMaker); diff --git a/paddle/fluid/operators/expand_as_v2_op.h b/paddle/fluid/operators/expand_as_v2_op.h index 2c62dc570ff21..a9dd1f08c385b 100644 --- a/paddle/fluid/operators/expand_as_v2_op.h +++ b/paddle/fluid/operators/expand_as_v2_op.h @@ -17,9 +17,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" -#define MAX_RANK_SUPPORTED 6 +#define MAX_RANK_SUPPORTED 8 namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index 71295296218f0..4f57a35a1039e 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -36,7 +36,7 @@ class ExpandOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( static_cast(x_dims.size()), expand_times.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of elements (%d) of 'expand_times' for " "Op(expand) must be equal to the number of dimensions " "(%d) of the input.", @@ -44,10 +44,11 @@ class ExpandOp : public framework::OperatorWithKernel { static_cast(x_dims.size()))); PADDLE_ENFORCE_LE( x_dims.size(), - 6, - platform::errors::InvalidArgument( + MAX_RANK_SUPPORTED, + phi::errors::InvalidArgument( "The number of dimensions of the input for Op(expand) " - "must not be greater than 6, but the value received is %d.", + "must not be greater than %d, but the value received is %d.", + MAX_RANK_SUPPORTED, x_dims.size())); std::vector out_shape(x_dims.size()); @@ -58,7 +59,7 @@ class ExpandOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT( expand_times[i], 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The %uth element of 'expand_times' for Op(expand) must be " "greater than 0, but the value given is %d.", i, @@ -98,7 +99,7 @@ class ExpandOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "(Tensor, default Tensor). A tensor with rank in [1, 6]." + "(Tensor, default Tensor). A tensor with rank in [1, 8]." "X is the input to be expanded."); AddInput("ExpandTimes", "(Tensor), optional). If provided, expand according to " @@ -112,7 +113,7 @@ class ExpandOpMaker : public framework::OpProtoAndCheckerMaker { .AsDuplicable() .AsDispensable(); AddOutput("Out", - "(Tensor, default Tensor). A tensor with rank in [1, 6]." + "(Tensor, default Tensor). A tensor with rank in [1, 8]." "The rank of Output(Out) have the same with Input(X). " "After expanding, size of each dimension of Output(Out) is equal " "to size of the corresponding dimension of Input(X) multiplying " @@ -123,7 +124,7 @@ class ExpandOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Expand operator tiles the input by given times number. You should set times number for each dimension by providing attribute 'expand_times'. The rank of X -should be in [1, 6]. Please note that size of 'expand_times' must be the same +should be in [1, 8]. Please note that size of 'expand_times' must be the same with X's rank. 
Following is a using case: Input(X) is a 3-D tensor with shape [2, 3, 1]: [ @@ -163,7 +164,7 @@ class ExpandGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( x_dims[0], out_dims[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dimension size (%d) of Input(Out@GRAD) should be " "equal to the corresponding dimension size (%d) of Input(X)", out_dims[0], @@ -179,7 +180,7 @@ class ExpandGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( x_dims[i] * expand_times[i], out_dims[i], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The %uth dimension size (%d) of Input(Out@GRAD) should be " "equal to the multiplication of the corresponding dimension " "sizes of Input(X) (%d) and expand_times (%d).", @@ -284,19 +285,18 @@ REGISTER_OP_CPU_KERNEL(expand_grad, ops::ExpandGradKernel, ops::ExpandGradKernel); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL( - expand, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel); +REGISTER_OP_CUDA_KERNEL(expand, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel); REGISTER_OP_CUDA_KERNEL( expand_grad, ops::ExpandGradKernel, ops::ExpandGradKernel, - ops::ExpandGradKernel, + ops::ExpandGradKernel, ops::ExpandGradKernel, ops::ExpandGradKernel); #endif diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h index ee100b3b48418..3d539cbf0c944 100644 --- a/paddle/fluid/operators/expand_op.h +++ b/paddle/fluid/operators/expand_op.h @@ -19,9 +19,9 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" -#define MAX_RANK_SUPPORTED 6 +#define MAX_RANK_SUPPORTED 8 namespace paddle { namespace operators { @@ -97,14 +97,14 @@ class ExpandKernel : public framework::OpKernel { PADDLE_ENFORCE_GE( rank, 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of dimensions of the input 'x' for Op(expand) " "must be greater than or equal to 1, but the value received is %d.", rank)); PADDLE_ENFORCE_LE( rank, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of dimensions of the input 'x' for Op(expand) " "must be less than or equal to %d, but the value received is %d.", MAX_RANK_SUPPORTED, @@ -128,6 +128,12 @@ class ExpandKernel : public framework::OpKernel { case 6: Expand<6>(context); break; + case 7: + Expand<7>(context); + break; + case 8: + Expand<8>(context); + break; } } @@ -140,7 +146,7 @@ class ExpandKernel : public framework::OpKernel { auto expand_times = get_expand_times(context); PADDLE_ENFORCE_EQ(static_cast(in_dims.size()), expand_times.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of elements (%d) of 'expand_times' for " "Op(expand) must be equal to the number " "of dimensions (%d) of the input.", @@ -166,10 +172,10 @@ class ExpandKernel : public framework::OpKernel { // use 32-bit index to speed up bool use_32bit_index = y.size() < Eigen::NumTraits::highest(); if (use_32bit_index) { - EigenBroadcast, T, Rank>::Eval( + phi::funcs::EigenBroadcast, T, Rank>::Eval( place, To32BitIndex(y), To32BitIndex(x), bcast_dims); } else { - 
EigenBroadcast, T, Rank>::Eval( + phi::funcs::EigenBroadcast, T, Rank>::Eval( place, y, x, bcast_dims); } } @@ -216,7 +222,7 @@ class ExpandGradKernel : public framework::OpKernel { } else { PADDLE_ENFORCE_GE(dims, 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of dimensions of the input " "'Out@GRAD' for Op(expand_grad)" " must be greater than or equal to 1, but " @@ -224,7 +230,7 @@ class ExpandGradKernel : public framework::OpKernel { dims)); PADDLE_ENFORCE_LE(dims, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of dimensions of the input 'Out@GRAD' " "for Op(expand_grad) must be less than or equal " "to %d, but the value received is %d.", @@ -249,10 +255,17 @@ class ExpandGradKernel : public framework::OpKernel { case 6: ExpandBackward<6>(context, reshape_dims_vec, reduce_dims_vec); break; + case 7: + ExpandBackward<7>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 8: + ExpandBackward<8>(context, reshape_dims_vec, reduce_dims_vec); + break; default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Only support tensor with rank being between 1 and 6. But " + PADDLE_THROW(phi::errors::InvalidArgument( + "Only support tensor with rank being between 1 and %d. But " "received tensor's rank = %d.", + MAX_RANK_SUPPORTED, dims)); } } @@ -267,14 +280,14 @@ class ExpandGradKernel : public framework::OpKernel { size_t reduce_size = reduce_dims_vec.size(); PADDLE_ENFORCE_EQ(reshape_size, reshape_dims_vec.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Inconsistent size between template Dims (%d) and " "reshape dimensions (%d).", reshape_size, reshape_dims_vec.size())); PADDLE_ENFORCE_EQ(reduce_size, reduce_dims_vec.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Inconsistent size between template Dims (%d) and " "reduce dimensions (%d).", reduce_size, @@ -294,8 +307,8 @@ class ExpandGradKernel : public framework::OpKernel { auto out_grad = EigenVector::Flatten(*in0); auto& place = *context.template device_context().eigen_device(); - EigenBroadcastGrad, T, Dims>::Eval( - place, x_grad, out_grad, reduce_dims, reshape_dims); + phi::funcs::EigenBroadcastGrad, T, Dims>:: + Eval(place, x_grad, out_grad, reduce_dims, reshape_dims); } }; diff --git a/paddle/fluid/operators/expand_v2_op.h b/paddle/fluid/operators/expand_v2_op.h index 0a70faddb7d58..57013d5eb8bd1 100644 --- a/paddle/fluid/operators/expand_v2_op.h +++ b/paddle/fluid/operators/expand_v2_op.h @@ -20,9 +20,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" -#define MAX_RANK_SUPPORTED 6 +#define MAX_RANK_SUPPORTED 8 namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc index ee64d5e1c5cc5..e527ae2d876e9 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cc +++ b/paddle/fluid/operators/fake_dequantize_op.cc @@ -232,12 +232,12 @@ class FakeChannelWiseDequantizeMaxAbsOpMaker "and mul, the quant_axis is equal to the cout axis.") .SetDefault(0) .AddCustomChecker([](const int& quant_axis) { - PADDLE_ENFORCE_EQ(quant_axis == 0 || quant_axis == 1, - true, - platform::errors::InvalidArgument( - "'quant_axis' should be 0 or 1, but " - "the received is %d", - quant_axis)); + PADDLE_ENFORCE_EQ( + quant_axis == 0 || quant_axis == 1, + true, + phi::errors::InvalidArgument("'quant_axis' should be 0 or 1, but " + "the received is %d", + quant_axis)); }); AddAttr("x_num_col_dims", "The x_num_col_dims of mul. Only used for mul or matmul.") @@ -245,7 +245,7 @@ class FakeChannelWiseDequantizeMaxAbsOpMaker .AddCustomChecker([](const int& x_num_col_dims) { PADDLE_ENFORCE_EQ(x_num_col_dims == 0, false, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'x_num_col_dims' should be larger than 0, but " "the received is %d", x_num_col_dims)); diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu index ea069daa40d7d..1e9c28661e23c 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cu +++ b/paddle/fluid/operators/fake_dequantize_op.cu @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fake_dequantize_op.cu.h" namespace ops = paddle::operators; -using float16 = paddle::platform::float16; +using float16 = phi::dtype::float16; PD_REGISTER_STRUCT_KERNEL(fake_dequantize_max_abs, GPU, diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h index 57887721308d4..420996e878b76 100644 --- a/paddle/fluid/operators/fake_dequantize_op.h +++ b/paddle/fluid/operators/fake_dequantize_op.h @@ -82,7 +82,7 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( scales[0]->numel(), in->dims()[quant_axis], - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The number of first scale values must be the same with " "quant_axis dimension value of Input(X) when the `Scales` has " "only one element, but %ld != %ld here.", @@ -93,7 +93,7 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( scales[0]->numel(), in->dims()[x_num_col_dims], - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The number of first scale values must be the same with " "corresponding dimension value of Input(X) when the `Scales` " "has two elements, but %ld != %ld here.", @@ -101,7 +101,7 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel { in->dims()[1])); PADDLE_ENFORCE_EQ(scales[1]->numel(), 1, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The second scale tensor should only have one " "value at now, but it has %ld values here.", scales[1]->numel())); diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index a5169892187a2..d7d9a1416d919 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -54,9 +54,9 @@ struct FindChannelAbsMaxFunctor { PADDLE_ENFORCE_EQ( quant_axis == 0 || quant_axis == 1, true, - platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but " - "the received is %d", - quant_axis)); + phi::errors::InvalidArgument("'quant_axis' should be 0 or 1, but " + "the received is %d", + quant_axis)); auto *in_data = in_tensor.data(); auto in_dims = in_tensor.dims(); const int64_t channel = in_dims[quant_axis]; @@ -167,9 +167,9 @@ struct ChannelClipAndFakeQuantFunctor { PADDLE_ENFORCE_EQ( quant_axis == 0 || quant_axis == 1, true, - platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but " - "the received is %d", - quant_axis)); + phi::errors::InvalidArgument("'quant_axis' should be 0 or 1, but " + "the received is %d", + quant_axis)); auto *scale_data = scale.data(); auto *in_data = in.data(); auto *out_data = out->mutable_data(ctx.GetPlace()); @@ -247,9 +247,9 @@ struct ChannelClipFakeQuantDequantFunctor { PADDLE_ENFORCE_EQ( quant_axis == 0 || quant_axis == 1, true, - platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but " - "the received is %d", - quant_axis)); + phi::errors::InvalidArgument("'quant_axis' should be 0 or 1, but " + "the received is %d", + quant_axis)); auto *scale_data = scale.data(); auto *in_data = in.data(); @@ -426,7 +426,7 @@ class FakeQuantOrWithDequantAbsMaxOpMaker .AddCustomChecker([](const int &bit_length) { PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'bit_length' should be between 1 and 16, but " "the received is %d", bit_length)); @@ -493,19 +493,19 @@ class FakeChannelWiseQuantizeAbsMaxOpMaker 
"and mul, the quant_axis is equal to the cout axis.") .SetDefault(0) .AddCustomChecker([](const int &quant_axis) { - PADDLE_ENFORCE_EQ(quant_axis == 0 || quant_axis == 1, - true, - platform::errors::InvalidArgument( - "'quant_axis' should be 0 or 1, but " - "the received is %d", - quant_axis)); + PADDLE_ENFORCE_EQ( + quant_axis == 0 || quant_axis == 1, + true, + phi::errors::InvalidArgument("'quant_axis' should be 0 or 1, but " + "the received is %d", + quant_axis)); }); AddAttr("bit_length", "(int, default 8)") .SetDefault(8) .AddCustomChecker([](const int &bit_length) { PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'bit_length' should be between 1 and 16, but " "the received is %d", bit_length)); @@ -574,19 +574,19 @@ class FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker "and mul, the quant_axis is equal to the cout axis.") .SetDefault(0) .AddCustomChecker([](const int &quant_axis) { - PADDLE_ENFORCE_EQ(quant_axis == 0 || quant_axis == 1, - true, - platform::errors::InvalidArgument( - "'quant_axis' should be 0 or 1, but " - "the received is %d", - quant_axis)); + PADDLE_ENFORCE_EQ( + quant_axis == 0 || quant_axis == 1, + true, + phi::errors::InvalidArgument("'quant_axis' should be 0 or 1, but " + "the received is %d", + quant_axis)); }); AddAttr("bit_length", "(int, default 8)") .SetDefault(8) .AddCustomChecker([](const int &bit_length) { PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'bit_length' should be between 1 and 16, but " "the received is %d", bit_length)); @@ -654,7 +654,7 @@ class FakeQuantizeRangeAbsMaxOpMaker .AddCustomChecker([](const int &bit_length) { PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'bit_length' should be between 1 and 16, but " "the received is %d", bit_length)); @@ -735,7 +735,7 @@ class FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker .AddCustomChecker([](const int &bit_length) { PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'bit_length' should be between 1 and 16, but " "the received is %d", bit_length)); diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 68ceaca46d04f..240fd119ff09a 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fake_quantize_op.cu.h" namespace ops = paddle::operators; -using float16 = paddle::platform::float16; +using float16 = phi::dtype::float16; PD_REGISTER_STRUCT_KERNEL(fake_quantize_abs_max, GPU, diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h index bdf8a80debb64..cb2f498c22b0b 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu.h +++ b/paddle/fluid/operators/fake_quantize_op.cu.h @@ -31,7 +31,7 @@ struct QuantizeDataType { }; template <> -struct QuantizeDataType { +struct QuantizeDataType { using type = float; }; @@ -92,7 +92,7 @@ struct FindAbsMaxFunctor { }; template struct FindAbsMaxFunctor; -template struct FindAbsMaxFunctor; +template struct FindAbsMaxFunctor; template __global__ void FindChannelAbsMaxKernelQuantAxis0(const T *in, @@ -172,9 +172,9 @@ struct FindChannelAbsMaxFunctor { PADDLE_ENFORCE_EQ( quant_axis == 0 || quant_axis == 1, true, - platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but " - "the received is %d", - quant_axis)); + phi::errors::InvalidArgument("'quant_axis' should be 0 or 1, but " + "the received is %d", + quant_axis)); const int num = in_tensor.numel(); auto in_dims = in_tensor.dims(); const T *in_data = in_tensor.data(); @@ -419,9 +419,9 @@ struct ChannelClipAndFakeQuantFunctor { PADDLE_ENFORCE_EQ( quant_axis == 0 || quant_axis == 1, true, - platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but " - "the received is %d", - quant_axis)); + phi::errors::InvalidArgument("'quant_axis' should be 0 or 1, but " + "the received is %d", + quant_axis)); int64_t num = in.numel(); auto in_dims = in.dims(); @@ -665,9 +665,9 @@ struct ChannelClipFakeQuantDequantFunctor { PADDLE_ENFORCE_EQ( quant_axis == 0 || quant_axis == 1, true, - platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but " - "the received is %d", - quant_axis)); + phi::errors::InvalidArgument("'quant_axis' should be 0 or 1, but " + "the received is %d", + quant_axis)); int num = in.numel(); auto in_dims = in.dims(); diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h index 6387018d1865e..39af6b5d5dec2 100644 --- a/paddle/fluid/operators/fake_quantize_op.h +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -453,11 +453,11 @@ class StraightThroughEstimatorGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Out")); auto x_grad_name = framework::GradVarName("X"); auto *d_x = context.Output(x_grad_name); - PADDLE_ENFORCE_NOT_NULL(d_x, - platform::errors::PreconditionNotMet( - "StraightThroughEstimatorGradKernel " - "doesn't have the output named %s.", - x_grad_name)); + PADDLE_ENFORCE_NOT_NULL( + d_x, + phi::errors::PreconditionNotMet("StraightThroughEstimatorGradKernel " + "doesn't have the output named %s.", + x_grad_name)); // Initialize dx as same as d_out d_x->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 8a27649af864b..730ba969c779f 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -33,7 +33,7 @@ class FillConstantOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE( shape[i], 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Each value of attribute 'shape' is expected to be no less " "than 0. 
But received: shape[%u] = %d; shape = [%s].", i, @@ -96,7 +96,7 @@ class FillConstantOp : public framework::OperatorWithKernel { kt.set_backend(phi::Backend::XPU); break; default: - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(phi::errors::Unimplemented( "Could NOT determine the place of variable, place_type = %d .", place_type)); } diff --git a/paddle/fluid/operators/fill_zeros_like_op.cu.cc b/paddle/fluid/operators/fill_zeros_like_op.cu.cc index c00d23928a70c..e398e94e4ba09 100644 --- a/paddle/fluid/operators/fill_zeros_like_op.cu.cc +++ b/paddle/fluid/operators/fill_zeros_like_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/phi/common/float16.h" namespace ops = paddle::operators; namespace plat = paddle::platform; @@ -29,7 +29,7 @@ PD_REGISTER_STRUCT_KERNEL(fill_zeros_like, int64_t, float, double, - plat::float16, + phi::dtype::float16, bool, plat::complex, plat::complex) {} @@ -42,7 +42,7 @@ PD_REGISTER_STRUCT_KERNEL(fill_zeros_like2, int64_t, float, double, - plat::float16, + phi::dtype::float16, bool, plat::complex, plat::complex) {} diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc index 48c1a23b8591d..b6b67e12fd24d 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -43,12 +43,12 @@ class Flatten2Op : public framework::OperatorWithKernel { const auto &in_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_GE(axis, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The axis should be greater than or equal to 0.")); PADDLE_ENFORCE_LE( axis, in_dims.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The axis should be less than or equal to input tensor's rank")); const auto &out_dims = Flatten2Op::GetOutputShape(axis, in_dims); diff --git a/paddle/fluid/operators/fsp_op.cc b/paddle/fluid/operators/fsp_op.cc index d2c1d2c45d685..c447f9d485f5c 100644 --- a/paddle/fluid/operators/fsp_op.cc +++ b/paddle/fluid/operators/fsp_op.cc @@ -34,28 +34,28 @@ class FSPOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( x_dims.size(), 4UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(X) must have shape [batch_size, channel, height, width]." "Now the dimension of 'X' is %d.", x_dims.size())); PADDLE_ENFORCE_EQ( y_dims.size(), 4UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(Y) must have shape [batch_size, channel, height, width]." 
"Now the dimension of 'Y' is %d.", y_dims.size())); PADDLE_ENFORCE_EQ( x_dims[2], y_dims[2], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(X)(%d) and Input(Y)(%d) should have the same height.", x_dims[2], y_dims[2])); PADDLE_ENFORCE_EQ( x_dims[3], y_dims[3], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(X)(%d) and Input(Y)(%d) should have the same width.", x_dims[3], y_dims[3])); diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index ced20a0108a52..713ad1931ce23 100755 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -6,7 +6,6 @@ endif() register_operators( EXCLUDES fused_bn_activation_op - fusion_conv_inception_op yolo_box_head_op yolo_box_post_op fusion_group_op @@ -39,11 +38,7 @@ if(WITH_GPU OR WITH_ROCM) op_library(fused_bn_activation_op) endif() # HIP not support cudnnTransformTensor - # fusion_conv_inception_op needs cudnn 7 above # HIP not support cudnnConvolutionBiasActivationForward - if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7100)) - op_library(fusion_conv_inception_op) - endif() op_library(yolo_box_head_op) op_library(yolo_box_post_op) op_library(fused_gate_attention_op) diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index 8ea1e11cd29f4..2f1847d951058 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -170,7 +170,7 @@ void LaunchBiasAddFwKernel(const phi::GPUContext& ctx, break; } default: { - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(phi::errors::Unimplemented( "Unsupported vectorized size: %d !", vec_size)); break; } diff --git a/paddle/fluid/operators/fused/attn_feed_forward.h b/paddle/fluid/operators/fused/attn_feed_forward.h index 77339f1fa0d64..25ba1cc13ead2 100644 --- a/paddle/fluid/operators/fused/attn_feed_forward.h +++ b/paddle/fluid/operators/fused/attn_feed_forward.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/operators/fused/attn_bias_add.cu.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/kernels/funcs/blas/blas.h" namespace paddle { diff --git a/paddle/fluid/operators/fused/attn_gemm_int8.h b/paddle/fluid/operators/fused/attn_gemm_int8.h index 8dc4810b1f3b9..a6865649b26ae 100644 --- a/paddle/fluid/operators/fused/attn_gemm_int8.h +++ b/paddle/fluid/operators/fused/attn_gemm_int8.h @@ -19,8 +19,8 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/cublaslt.h" #include "paddle/fluid/operators/fused/quant_dequant_kernel.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/float16.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" @@ -87,12 +87,12 @@ class AttnMatmulINT8 { std::vector outs = {bias_out}; phi::funcs::BroadcastKernel( dev_ctx_, ins, &outs, phi::funcs::AddFunctor()); - PADDLE_ENFORCE_EQ(cudaGetLastError(), - cudaSuccess, - platform::errors::Fatal( - "cuda error occurred after computing bias. " - "But it does not mean this error is caused by " - "bias computing")); + PADDLE_ENFORCE_EQ( + cudaGetLastError(), + cudaSuccess, + phi::errors::Fatal("cuda error occurred after computing bias. 
" + "But it does not mean this error is caused by " + "bias computing")); } } @@ -141,12 +141,12 @@ class AttnMatmulINT8 { std::vector outs = {bias_out}; phi::funcs::BroadcastKernel( dev_ctx_, ins, &outs, phi::funcs::AddFunctor()); - PADDLE_ENFORCE_EQ(cudaGetLastError(), - cudaSuccess, - platform::errors::Fatal( - "cuda error occurred after computing bias. " - "But it does not mean this error is caused by " - "bias computing")); + PADDLE_ENFORCE_EQ( + cudaGetLastError(), + cudaSuccess, + phi::errors::Fatal("cuda error occurred after computing bias. " + "But it does not mean this error is caused by " + "bias computing")); } } diff --git a/paddle/fluid/operators/fused/cublaslt.h b/paddle/fluid/operators/fused/cublaslt.h index e9728c58b55dc..e3f96b9ec1d3d 100644 --- a/paddle/fluid/operators/fused/cublaslt.h +++ b/paddle/fluid/operators/fused/cublaslt.h @@ -54,7 +54,7 @@ class CublasLtHelper { PADDLE_ENFORCE_EQ( status, CUBLAS_STATUS_SUCCESS, - platform::errors::External( + phi::errors::External( "cublasLtMatrixLayoutCreate execution error" "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " "information")); @@ -69,7 +69,7 @@ class CublasLtHelper { PADDLE_ENFORCE_EQ( status, CUBLAS_STATUS_SUCCESS, - platform::errors::External( + phi::errors::External( "cublasLtMatmulDescCreate execution error" "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " "information")); @@ -81,7 +81,7 @@ class CublasLtHelper { PADDLE_ENFORCE_EQ( status, CUBLAS_STATUS_SUCCESS, - platform::errors::External( + phi::errors::External( "cublasLtMatmulDescSetAttribute execution error" "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " "information")); @@ -91,7 +91,7 @@ class CublasLtHelper { PADDLE_ENFORCE_EQ( status, CUBLAS_STATUS_SUCCESS, - platform::errors::External( + phi::errors::External( "cublasLtMatrixLayoutCreate execution error" "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " "information")); @@ -100,7 +100,7 @@ class CublasLtHelper { PADDLE_ENFORCE_EQ( status, CUBLAS_STATUS_SUCCESS, - platform::errors::External( + phi::errors::External( "cublasLtMatrixLayoutCreate execution error" "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " "information")); @@ -109,7 +109,7 @@ class CublasLtHelper { PADDLE_ENFORCE_EQ( status, CUBLAS_STATUS_SUCCESS, - platform::errors::External( + phi::errors::External( "cublasLtMatrixLayoutCreate execution error" "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " "information")); @@ -212,7 +212,7 @@ class CublasLtHelper { PADDLE_ENFORCE_EQ( status, CUBLAS_STATUS_SUCCESS, - platform::errors::External( + phi::errors::External( "cublasLtMatmul execution error" "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " "information")); diff --git a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h index 8f3b5e4f09a06..5fb6f38b4c682 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h +++ b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h @@ -39,7 +39,7 @@ struct BNStatsFinalizeArgs { PADDLE_ENFORCE_EQ( param_shape.size(), 4U, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of param_shape is expected to 4. But received " "param_shape's size is %d, param_shape is [%s].", param_shape.size(), @@ -160,11 +160,11 @@ class CudnnBNStatsFinalize { CUDNN_BATCHNORM_SPATIAL_PERSISTENT); // Check workspace size, also creates plan. 
size_t workspace_size_bytes = train_op_.GetWorkspaceSizeInBytes(handle); - PADDLE_ENFORCE_EQ(workspace_size_bytes, - 0U, - platform::errors::InvalidArgument( - "Unexpected non-zero workspace size for " - "CudnnBNStatsFinalize.")); + PADDLE_ENFORCE_EQ( + workspace_size_bytes, + 0U, + phi::errors::InvalidArgument("Unexpected non-zero workspace size for " + "CudnnBNStatsFinalize.")); train_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, static_cast(nullptr)); train_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, @@ -192,11 +192,11 @@ class CudnnBNStatsFinalize { CUDNN_BATCHNORM_SPATIAL_PERSISTENT); // Check workspace size, also creates plan. size_t workspace_size_bytes = inference_op_.GetWorkspaceSizeInBytes(handle); - PADDLE_ENFORCE_EQ(workspace_size_bytes, - 0U, - platform::errors::InvalidArgument( - "Unexpected non-zero workspace size for " - "CudnnBNStatsFinalize.")); + PADDLE_ENFORCE_EQ( + workspace_size_bytes, + 0U, + phi::errors::InvalidArgument("Unexpected non-zero workspace size for " + "CudnnBNStatsFinalize.")); inference_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, static_cast(nullptr)); inference_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, diff --git a/paddle/fluid/operators/fused/cudnn_fusion_helper.h b/paddle/fluid/operators/fused/cudnn_fusion_helper.h index 7b738383f6ac7..f1df14c4f60de 100644 --- a/paddle/fluid/operators/fused/cudnn_fusion_helper.h +++ b/paddle/fluid/operators/fused/cudnn_fusion_helper.h @@ -52,7 +52,7 @@ class CudnnFusionOp { PADDLE_ENFORCE_EQ( plan_created_, true, - platform::errors::Fatal( + phi::errors::Fatal( "CudnnFusionOp exec requested without a valid 'plan', need: " ", GetWorkspaceSizeBytes(), Execute().")); PADDLE_ENFORCE_GPU_SUCCESS( diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h index 9dbb8a8eaebc8..5d0e6c44c4e63 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h +++ b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h @@ -55,7 +55,7 @@ struct NormConvolutionArgs { PADDLE_ENFORCE_EQ( input_shape.size(), 4U, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of input_shape is expected to 4. But received " "input_shape's size is %d, input_shape is [%s].", input_shape.size(), @@ -63,7 +63,7 @@ struct NormConvolutionArgs { PADDLE_ENFORCE_EQ( filter_shape.size(), 4U, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of filter_shape is expected to 4. But received " "filter_shape's size is %d, filter_shape is [%s].", filter_shape.size(), @@ -71,13 +71,13 @@ struct NormConvolutionArgs { PADDLE_ENFORCE_EQ(filter_shape[1] == filter_shape[2] && (filter_shape[1] == 1 || filter_shape[1] == 3), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The filter_shape is expected to store as nhwc, and " "h = w = 1 or 3. But received filter_shape is [%s].", common::make_ddim(filter_shape))); PADDLE_ENFORCE_EQ((filter_shape[0] % 32 == 0 && filter_shape[3] % 8 == 0), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input channel is expected to be multiple of 8, " "and the output channel is expected to be multiple " "of 32. But received input channel is %d, output " @@ -87,7 +87,7 @@ struct NormConvolutionArgs { PADDLE_ENFORCE_EQ( output_shape.size(), 4U, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of output_shape is expected to 4. 
But received " "filter_shape's size is %d, filter_shape is [%s].", output_shape.size(), @@ -96,7 +96,7 @@ struct NormConvolutionArgs { PADDLE_ENFORCE_EQ( is_support, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Current test is only supported in the platforms with " "compatiblity greater than or equal to 70 and the kernel size " "must be equal to 1 or 3. When the kernel size is 1, " diff --git a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h index 8b731e2c55408..7f47ea40e6cea 100644 --- a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h +++ b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h @@ -43,7 +43,7 @@ struct ScaleBiasAddReluArgs { PADDLE_ENFORCE_EQ( data_shape.size(), 4U, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of data_shape is expected to 4. But received " "data_shape's size is %d, data_shape is [%s].", data_shape.size(), @@ -51,7 +51,7 @@ struct ScaleBiasAddReluArgs { PADDLE_ENFORCE_EQ( param_shape.size(), 4U, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of param_shape is expected to 4. But received " "param_shape's size is %d, param_shape is [%s].", param_shape.size(), @@ -59,7 +59,7 @@ struct ScaleBiasAddReluArgs { PADDLE_ENFORCE_EQ( bitmask_shape.size(), 3U, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of bitmask_shape is expected to 3. But received " "bitmask_shape's size is %d, bitmask_shape is [%s].", bitmask_shape.size(), @@ -76,7 +76,7 @@ struct ScaleBiasAddReluArgs { PADDLE_ENFORCE_EQ( act_type, "relu", - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only relu activation supported in normalized convolution.")); mode = CUDNN_ACTIVATION_RELU; } diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index 843b5009a6fcc..2a43eea07535a 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -118,7 +118,7 @@ void InvokeTransposeRemovePadding(const phi::GPUContext& dev_ctx, PADDLE_ENFORCE_EQ( head_dim % PackSize, 0, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "dim_head=%d must be divisible by vec_size=%d", head_dim, PackSize)); const int32_t pack_num = elem_cnt / PackSize; const int32_t block_size = 128; @@ -666,7 +666,7 @@ class FMHARef { dev_ctx_, qk_out_grad_tensor); } else { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Only used for the backward elementwise_add op when" "dy is not needed and dx is not reduce")); return; diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc index 8ae1a60ad3b94..d46265de1b354 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_attention_op.cc @@ -124,7 +124,7 @@ class FusedAttentionOp : public framework::OperatorWithKernel { if (transpose_qkv_wb) { PADDLE_ENFORCE_EQ(y_dim.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimensions of qkv_weight must be 2 if enable" "transpose_qkv_wb: (dim_embed, 3 * dim_embed)," "but received dimensions of" @@ -132,13 +132,13 @@ class FusedAttentionOp : public framework::OperatorWithKernel { y_dim.size())); PADDLE_ENFORCE_GT(num_heads, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The num_heads must be provided and 
greater than 0 " "if enable transpose_qkv_wb, but we got %d.", num_heads)); PADDLE_ENFORCE_EQ(y_dim[0] % num_heads, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "First dim of qkv_w must be divisible by num heads " "if enable transpose_qkv_wb, but receive first " "dim of qkv_w is %d and num_heads is %d.", @@ -147,7 +147,7 @@ class FusedAttentionOp : public framework::OperatorWithKernel { if (ctx->Attrs().Get("ring_id") == -1) { PADDLE_ENFORCE_EQ(y_dim[0] * 3, y_dim[1], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimensions of qkv_weight must be 2" "(dim_embed, 3 * dim_embed).")); } else { @@ -159,21 +159,21 @@ class FusedAttentionOp : public framework::OperatorWithKernel { } else { PADDLE_ENFORCE_EQ(y_dim.size(), 4, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimensions of qkv_weight must be 4 if not" "enable transpose_qkv_wb: (3, num_head, dim_head, " "dim_embed), but received [%d]", y_dim.size())); PADDLE_ENFORCE_EQ(y_dim[0], 3, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "First dim of qkv_w must be 3 if disable " "transpose_qkv_wb, but we got %d.", y_dim[0])); if (ctx->Attrs().Get("ring_id") == -1) { PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2], y_dim[3], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimensions of qkv_weight must be 4" "(3, num_head, dim_head, dim_embed)," "and must satisfy the limitations: " @@ -186,15 +186,15 @@ class FusedAttentionOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( x_dim.size(), 3, - platform::errors::InvalidArgument("The dimensions of x must be 3" - "(batch_size, seq_len, dim_embed)," - "but received dimensions of" - "Input is [%d]", - x_dim.size())); + phi::errors::InvalidArgument("The dimensions of x must be 3" + "(batch_size, seq_len, dim_embed)," + "but received dimensions of" + "Input is [%d]", + x_dim.size())); PADDLE_ENFORCE_EQ(x_dim[2], hidden_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: the dimension of x_dim[2] and y_dim[3] " "(y_dim[1] if enable transpose_qkv_w) " "must be equal. 
But received: the shape " @@ -245,23 +245,23 @@ class FusedAttentionOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( c_dim.size(), 5, - paddle::platform::errors::InvalidArgument( - "The CacheKV must be 5 dims, but got %d", c_dim.size())); + phi::errors::InvalidArgument("The CacheKV must be 5 dims, but got %d", + c_dim.size())); PADDLE_ENFORCE_EQ(c_dim[0], 2, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dim of CacheKV must be 2, but got %d", c_dim[0])); // 2 PADDLE_ENFORCE_EQ(c_dim[1], x_dim[0], - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dim of CacheKV must be equal with " "batch size %d, but got %d", x_dim[0], c_dim[1])); // batch_size PADDLE_ENFORCE_EQ(c_dim[2], num_heads, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The third dim of CacheKV must be equal with num " "head %d, but got %d", num_heads, @@ -272,14 +272,14 @@ class FusedAttentionOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE( c_dim[3], 0, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The forth dim of CacheKV must be greater than 0, but got %d", c_dim[3])); // cache_seq_len } PADDLE_ENFORCE_EQ(c_dim[4], dim_head, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The fifth dim of CacheKV must be equal with head " "size %d, but got %d", dim_head, @@ -400,7 +400,7 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { .AddCustomChecker([](const float &epsilon) { PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'epsilon' in Op(LayerNorm) should be between" "0.0 and 0.001, But received [%s].", epsilon)); @@ -413,7 +413,7 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { PADDLE_ENFORCE_EQ( drop_p >= 0.0f && drop_p <= 1.0f, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'attn_dropout_rate' must be between 0.0 and 1.0.")); }); AddAttr("is_test", @@ -449,7 +449,7 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { PADDLE_ENFORCE_EQ( type == "downgrade_in_infer" || type == "upscale_in_train", true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "dropout_implementation can only be downgrade_in_infer or " "upscale_in_train")); }); @@ -459,7 +459,7 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { .AddCustomChecker([](const float &drop_p) { PADDLE_ENFORCE_EQ(drop_p >= 0.0f && drop_p <= 1.0f, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'dropout_rate' must be between 0.0 and 1.0.")); }); AddAttr("dropout_fix_seed", @@ -479,7 +479,7 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { PADDLE_ENFORCE_EQ( type == "downgrade_in_infer" || type == "upscale_in_train", true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "dropout_implementation can only be downgrade_in_infer or " "upscale_in_train")); }); @@ -489,7 +489,7 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { .AddCustomChecker([](const float &ln_epsilon) { PADDLE_ENFORCE_EQ(ln_epsilon >= 0.0f && ln_epsilon <= 0.001f, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'epsilon' of the second LayerNorm in Fused " "attention op should be between" "0.0 and 0.001, But received [%s].", @@ -540,7 +540,7 @@ class FusedAttentionGradOp : 
public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE_EQ(ctx->Attrs().Get("is_test"), false, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "GradOp is only callable when is_test is false")); if (ctx->Attrs().Get("pre_layer_norm") == false) { diff --git a/paddle/fluid/operators/fused/fused_attention_utils.h b/paddle/fluid/operators/fused/fused_attention_utils.h index 39eb4c821e00a..18e3a513b3053 100644 --- a/paddle/fluid/operators/fused/fused_attention_utils.h +++ b/paddle/fluid/operators/fused/fused_attention_utils.h @@ -62,7 +62,7 @@ static void AllReduce(phi::DenseTensor &tensor, // NOLINT // Use New Communication Library PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. " @@ -73,7 +73,7 @@ static void AllReduce(phi::DenseTensor &tensor, // NOLINT comm_context_manager.Get(std::to_string(ring_id))); PADDLE_ENFORCE_NE(comm_ctx, nullptr, - paddle::platform::errors::Unavailable( + phi::errors::Unavailable( "NCCLCommContext is nullptr, collective op should " "has ring_id attr.")); diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_activation_op.cc index 2ea40d840d2b3..69869cd3b7729 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cc @@ -27,57 +27,57 @@ namespace operators { void FusedBatchNormActOp::InferShape(framework::InferShapeContext *ctx) const { PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(X) of BatchNormOp should not be null.")); PADDLE_ENFORCE_EQ(ctx->HasInput("Scale"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(Scale) of BatchNormOp should not be null.")); PADDLE_ENFORCE_EQ(ctx->HasInput("Bias"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(Bias) of BatchNormOp should not be null.")); PADDLE_ENFORCE_EQ(ctx->HasInput("Mean"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(Mean) of BatchNormOp should not be null.")); PADDLE_ENFORCE_EQ(ctx->HasInput("Variance"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(Variance) of BatchNormOp should not be null.")); PADDLE_ENFORCE_EQ(ctx->HasOutput("Y"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(Y) of BatchNormOp should not be null.")); PADDLE_ENFORCE_EQ(ctx->HasOutput("MeanOut"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(MeanOut) of BatchNormOp should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasOutput("VarianceOut"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(VarianceOut) of BatchNormOp should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasOutput("SavedMean"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(SavedMean) of BatchNormOp should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasOutput("SavedVariance"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(SavedVariance) of BatchNormOp should not be null.")); // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python 
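The InferShape rewrite above strings together one presence check per input and output, all with the same shape. A compact sketch of that check, using a hypothetical op and variable names ("MyOp", "X", "Y" are illustrative, not taken from the patch):

#include "paddle/fluid/framework/op_registry.h"  // InferShapeContext, macros

void CheckIOPresence(paddle::framework::InferShapeContext *ctx) {
  // One PADDLE_ENFORCE_EQ(..., true, ...) per required slot.
  PADDLE_ENFORCE_EQ(
      ctx->HasInput("X"),
      true,
      phi::errors::InvalidArgument("Input(X) of MyOp should not be null."));
  PADDLE_ENFORCE_EQ(
      ctx->HasOutput("Y"),
      true,
      phi::errors::InvalidArgument("Output(Y) of MyOp should not be null."));
}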
PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0], - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Mean and MeanOut should share the same memory")); PADDLE_ENFORCE_EQ( ctx->Inputs("Variance")[0], ctx->Outputs("VarianceOut")[0], - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Variance and VarianceOut should share the same memory")); const auto x_dims = ctx->GetInputDim("X"); @@ -85,23 +85,23 @@ void FusedBatchNormActOp::InferShape(framework::InferShapeContext *ctx) const { PADDLE_ENFORCE_GE( x_dims.size(), 2, - platform::errors::PreconditionNotMet("ShapeError: the dimension of input " - "X must greater than or equal to 2." - "But received: the shape of input X " - "= [%s], the dimension of input X =" - "[%d]", - x_dims, - x_dims.size())); + phi::errors::PreconditionNotMet("ShapeError: the dimension of input " + "X must greater than or equal to 2." + "But received: the shape of input X " + "= [%s], the dimension of input X =" + "[%d]", + x_dims, + x_dims.size())); PADDLE_ENFORCE_LE( x_dims.size(), 5, - platform::errors::PreconditionNotMet("ShapeError: the dimension of input " - "X must smaller than or equal to 5." - "But received: the shape of input X " - "= [%s], the dimension of input X =" - "[%d]", - x_dims, - x_dims.size())); + phi::errors::PreconditionNotMet("ShapeError: the dimension of input " + "X must smaller than or equal to 5." + "But received: the shape of input X " + "= [%s], the dimension of input X =" + "[%d]", + x_dims, + x_dims.size())); const int64_t C = x_dims[x_dims.size() - 1]; @@ -111,7 +111,7 @@ void FusedBatchNormActOp::InferShape(framework::InferShapeContext *ctx) const { PADDLE_ENFORCE_EQ( scale_dim.size(), 1UL, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "ShapeError: the dimension of scale must equal to 1." "But received: the shape of scale is [%s], the dimension " "of scale is [%d]", @@ -119,7 +119,7 @@ void FusedBatchNormActOp::InferShape(framework::InferShapeContext *ctx) const { scale_dim.size())); PADDLE_ENFORCE_EQ(bias_dim.size(), 1UL, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "ShapeError: the dimension of bias must equal to 1." 
"But received: the shape of bias is [%s],the dimension " "of bias is [%d]", @@ -135,14 +135,14 @@ void FusedBatchNormActOp::InferShape(framework::InferShapeContext *ctx) const { if (check) { PADDLE_ENFORCE_EQ(scale_dim[0], C, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "ShapeError: the shape of scale must equal to [%d]" "But received: the shape of scale is [%d]", C, scale_dim[0])); PADDLE_ENFORCE_EQ(bias_dim[0], C, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "ShapeError: the shape of bias must equal to [%d]" "But received: the shape of bias is [%d]", C, @@ -166,25 +166,25 @@ phi::KernelKey FusedBatchNormActOp::GetExpectedKernelType( if (input_data_type == framework::proto::VarType::FP64) { bn_param_type = framework::proto::VarType::FP64; } - PADDLE_ENFORCE_EQ(bn_param_type, - framework::TransToProtoVarType( - ctx.Input("Scale")->dtype()), - platform::errors::PreconditionNotMet( - "Scale input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, - framework::TransToProtoVarType( - ctx.Input("Bias")->dtype()), - platform::errors::PreconditionNotMet( - "Bias input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, - framework::TransToProtoVarType( - ctx.Input("Mean")->dtype()), - platform::errors::PreconditionNotMet( - "Mean input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType( + ctx.Input("Scale")->dtype()), + phi::errors::PreconditionNotMet("Scale input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType( + ctx.Input("Bias")->dtype()), + phi::errors::PreconditionNotMet("Bias input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType( + ctx.Input("Mean")->dtype()), + phi::errors::PreconditionNotMet("Mean input should be of float type")); PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType( ctx.Input("Variance")->dtype()), - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Variance input should be of float type")); return phi::KernelKey(input_data_type, ctx.GetPlace()); @@ -197,7 +197,7 @@ void FusedBatchNormActOpMaker::Make() { .AddCustomChecker([](const float &epsilon) { PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Attr(epsilon) should be between 0.0 and 0.001, " "but received value is %f.", epsilon)); @@ -252,37 +252,37 @@ void FusedBatchNormActGradOp::InferShape( PADDLE_ENFORCE_EQ( ctx->HasInput("X"), true, - platform::errors::InvalidArgument("Input(X) should not be null.")); + phi::errors::InvalidArgument("Input(X) should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasInput("Scale"), true, - platform::errors::InvalidArgument("Input(Scale) should not be null.")); + phi::errors::InvalidArgument("Input(Scale) should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasInput(framework::GradVarName("Y")), true, - platform::errors::InvalidArgument("Input(Y@GRAD) should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("SavedMean"), - true, - platform::errors::InvalidArgument( - "Input(SavedMean) should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("SavedVariance"), - true, - platform::errors::InvalidArgument( - "Input(SavedVariance) should not be null")); + phi::errors::InvalidArgument("Input(Y@GRAD) should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("SavedMean"), + true, + phi::errors::InvalidArgument("Input(SavedMean) should not be null.")); 
+ PADDLE_ENFORCE_EQ( + ctx->HasInput("SavedVariance"), + true, + phi::errors::InvalidArgument("Input(SavedVariance) should not be null")); // check output PADDLE_ENFORCE_EQ( ctx->HasOutput(framework::GradVarName("X")), true, - platform::errors::InvalidArgument("Output(X@GRAD) should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Scale")), - true, - platform::errors::InvalidArgument( - "Output(Scale@GRAD) should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Bias")), - true, - platform::errors::InvalidArgument( - "Output(Bias@GRAD) should not be null.")); + phi::errors::InvalidArgument("Output(X@GRAD) should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput(framework::GradVarName("Scale")), + true, + phi::errors::InvalidArgument("Output(Scale@GRAD) should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput(framework::GradVarName("Bias")), + true, + phi::errors::InvalidArgument("Output(Bias@GRAD) should not be null.")); const auto x_dims = ctx->GetInputDim("X"); const int C = x_dims[x_dims.size() - 1]; @@ -297,8 +297,8 @@ phi::KernelKey FusedBatchNormActGradOp::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { const auto *var = ctx.InputVar(framework::GradVarName("Y")); if (var == nullptr) { - PADDLE_THROW(platform::errors::NotFound( - "Can not find Y@GRAD in the execution context.")); + PADDLE_THROW( + phi::errors::NotFound("Can not find Y@GRAD in the execution context.")); } const phi::DenseTensor *t = nullptr; if (var->IsType()) { @@ -306,7 +306,7 @@ phi::KernelKey FusedBatchNormActGradOp::GetExpectedKernelType( } if (t == nullptr) { PADDLE_THROW( - platform::errors::NotFound("Can not get the tensor value of Y@GRAD.")); + phi::errors::NotFound("Can not get the tensor value of Y@GRAD.")); } return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc index ac198e9cf2c25..ff903ee6ca716 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc @@ -51,35 +51,35 @@ void FusedBatchNormAddActOp::InferShape( const auto x_dims = ctx->GetInputDim("X"); const auto z_dims = ctx->GetInputDim("Z"); - PADDLE_ENFORCE_EQ(x_dims, - z_dims, - platform::errors::InvalidArgument( - "ShapeError: the shapes of input " - "must be equal. But received: the shape " - "of input X = [%s], and the shape of " - "input Y = [%s]", - x_dims, - z_dims)); + PADDLE_ENFORCE_EQ( + x_dims, + z_dims, + phi::errors::InvalidArgument("ShapeError: the shapes of input " + "must be equal. But received: the shape " + "of input X = [%s], and the shape of " + "input Y = [%s]", + x_dims, + z_dims)); PADDLE_ENFORCE_GE( x_dims.size(), 2, - platform::errors::InvalidArgument("ShapeError: the dimensions of input " - "must greater than or equal to 2." - "But received: the shape of input " - "= [%s], the dimension of input = " - "[%d]", - x_dims, - x_dims.size())); + phi::errors::InvalidArgument("ShapeError: the dimensions of input " + "must greater than or equal to 2." + "But received: the shape of input " + "= [%s], the dimension of input = " + "[%d]", + x_dims, + x_dims.size())); PADDLE_ENFORCE_LE( x_dims.size(), 5, - platform::errors::InvalidArgument("ShapeError: the dimensions of input " - "must smaller than or equal to 5." 
- "But received: the shape of input " - "= [%s], the dimension of input = " - "[%d]", - x_dims, - x_dims.size())); + phi::errors::InvalidArgument("ShapeError: the dimensions of input " + "must smaller than or equal to 5." + "But received: the shape of input " + "= [%s], the dimension of input = " + "[%d]", + x_dims, + x_dims.size())); const int64_t C = x_dims[x_dims.size() - 1]; @@ -89,7 +89,7 @@ void FusedBatchNormAddActOp::InferShape( PADDLE_ENFORCE_EQ( scale_dim.size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: the dimension of scale must equal to 1." "But received: the shape of scale is [%s], the dimension " "of scale is [%d]", @@ -97,7 +97,7 @@ void FusedBatchNormAddActOp::InferShape( scale_dim.size())); PADDLE_ENFORCE_EQ(bias_dim.size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: the dimension of bias must equal to 1." "But received: the shape of bias is [%s],the dimension " "of bias is [%d]", @@ -113,14 +113,14 @@ void FusedBatchNormAddActOp::InferShape( if (check) { PADDLE_ENFORCE_EQ(scale_dim[0], C, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: the shape of scale must equal to [%d]" "But received: the shape of scale is [%d]", C, scale_dim[0])); PADDLE_ENFORCE_EQ(bias_dim[0], C, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: the shape of bias must equal to [%d]" "But received: the shape of bias is [%d]", C, @@ -145,12 +145,12 @@ phi::KernelKey FusedBatchNormAddActOp::GetExpectedKernelType( bn_param_type, framework::TransToProtoVarType( ctx.Input("Scale")->dtype()), - platform::errors::InvalidArgument("Scale input should be of float type")); + phi::errors::InvalidArgument("Scale input should be of float type")); PADDLE_ENFORCE_EQ( bn_param_type, framework::TransToProtoVarType( ctx.Input("Bias")->dtype()), - platform::errors::InvalidArgument("Bias input should be of float type")); + phi::errors::InvalidArgument("Bias input should be of float type")); return phi::KernelKey(input_data_type, ctx.GetPlace()); } @@ -194,7 +194,7 @@ void FusedBatchNormAddActOpMaker::Make() { .AddCustomChecker([](const float &epsilon) { PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'epsilon' should be between 0.0 and 0.001.")); }); AddAttr("act_type", "The activation type to be fused.") @@ -261,8 +261,8 @@ phi::KernelKey FusedBatchNormAddActGradOp::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { const auto *var = ctx.InputVar(framework::GradVarName("Y")); if (var == nullptr) { - PADDLE_THROW(platform::errors::NotFound( - "Can not find Y@GRAD in the execution context.")); + PADDLE_THROW( + phi::errors::NotFound("Can not find Y@GRAD in the execution context.")); } const phi::DenseTensor *t = nullptr; if (var->IsType()) { @@ -270,7 +270,7 @@ phi::KernelKey FusedBatchNormAddActGradOp::GetExpectedKernelType( } if (t == nullptr) { PADDLE_THROW( - platform::errors::NotFound("Can not get the tensor value of Y@GRAD.")); + phi::errors::NotFound("Can not get the tensor value of Y@GRAD.")); } return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index ccd099109487c..737909be4d8bf 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -22,9 +22,9 @@ 
limitations under the License. */ #include "paddle/fluid/operators/fused/quant_dequant_kernel.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/float16.h" #include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/functors.h" #include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h" diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h index 2b1f6b14c33e5..9e9a89015652b 100644 --- a/paddle/fluid/operators/fused/fused_dropout_helper.h +++ b/paddle/fluid/operators/fused/fused_dropout_helper.h @@ -288,7 +288,7 @@ class FusedDropoutHelper { quant_max_bound, quant_min_bound); } else { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Currently only supports gelu or relu activation functions!")); } } @@ -332,7 +332,7 @@ class FusedDropoutHelper { d_bias, ctx); } else { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Currently only supports gelu or relu activation functions!")); } } diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc index b11840866d46b..b17a6827af0e9 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc @@ -24,7 +24,7 @@ bool IsUnaryCompound(const std::vector &functor_list) { PADDLE_ENFORCE_EQ( functor_list.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Invalid functor list size %d, which should be equal to %d.", functor_list.size(), 2)); @@ -39,7 +39,7 @@ bool HasInPlaceUnary(const std::vector &functor_list) { PADDLE_ENFORCE_EQ( functor_list.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Invalid functor list size %d, which should be equal to %d.", functor_list.size(), 2)); @@ -55,7 +55,7 @@ bool InputXCanBeAbsent(const std::vector &functor_list) { PADDLE_ENFORCE_EQ( functor_list.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Invalid functor list size %d, which should be equal to %d.", functor_list.size(), 2)); @@ -73,7 +73,7 @@ static bool IsSupportedCompound(const std::vector &functors) { PADDLE_ENFORCE_EQ( functors.size(), 2UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Invalid functor list size %d, which should be equal to %d.", functors.size(), 2)); @@ -89,12 +89,12 @@ static bool IsSupportedCompound(const std::vector &functors) { } else if (binary_fun.count(functors[1])) { unary_fun_str = functors[0]; } else { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "%s and %s are not included in fused_list.", functors[0], functors[1])); } PADDLE_ENFORCE_EQ(unary_fun.count(unary_fun_str), 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "%s is not included in fused_list.", unary_fun_str)); return true; } @@ -107,17 +107,17 @@ class FusedElemwiseActivationOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->HasInput("X"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(X) of FusedElemwiseActivationOp op should not be null.")); PADDLE_ENFORCE_EQ( 
ctx->HasInput("Y"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(Y) of FusedElemwiseActivationOp op should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(Out) of FusedElemwiseActivationOp op should not be null.")); auto x_dim = ctx->GetInputDim("X"); @@ -134,7 +134,7 @@ class FusedElemwiseActivationOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->HasOutput("IntermediateOut"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(IntermediateOut) of FusedElemwiseActivationOp " "should not be null.")); @@ -176,7 +176,7 @@ class FusedElemwiseActivationOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { PADDLE_ENFORCE_EQ(ctx.Input("X")->dtype(), ctx.Input("Y")->dtype(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The element's type of input should be the same.")); return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); @@ -214,7 +214,7 @@ class FusedElemwiseActivationMaker : public framework::OpProtoAndCheckerMaker { PADDLE_ENFORCE_EQ( IsSupportedCompound(functor_list), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "the input functors should support compounding.")); }); @@ -317,10 +317,10 @@ class FusedElemwiseActivationOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), - true, - platform::errors::InvalidArgument( - "Input(Out@Grad) should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput(framework::GradVarName("Out")), + true, + phi::errors::InvalidArgument("Input(Out@Grad) should not be null.")); auto functor_list = ctx->Attrs().Get>("functor_list"); @@ -328,14 +328,14 @@ class FusedElemwiseActivationOpGrad : public framework::OperatorWithKernel { if (ctx->Attrs().Get("save_intermediate_out")) { PADDLE_ENFORCE_EQ(ctx->HasInput("IntermediateOut"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(IntermediateOut) should not be null.")); } else { if (!InputXCanBeAbsent(functor_list)) { PADDLE_ENFORCE_EQ( ctx->HasInput("X"), true, - platform::errors::InvalidArgument("Input(X) should not be null.")); + phi::errors::InvalidArgument("Input(X) should not be null.")); } } @@ -353,7 +353,7 @@ class FusedElemwiseActivationOpGrad : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( InputXCanBeAbsent(functor_list), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only when BinaryFunctor is elementwise_add, the 'X' " "could be absent.")); @@ -370,7 +370,7 @@ class FusedElemwiseActivationOpGrad : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->HasInput("Y"), true, - platform::errors::InvalidArgument("Input(Y) should not be null.")); + phi::errors::InvalidArgument("Input(Y) should not be null.")); ctx->SetOutputDim(y_grad_name, ctx->GetInputDim("Y")); ctx->ShareLoD("Y", y_grad_name); } @@ -414,7 +414,7 @@ class FusedElemwiseAddActivationOp : public FusedElemwiseActivationOp { PADDLE_ENFORCE_EQ( elemntwise_add_detected, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "When the FusedElemwiseAddActivationOp Is used in fused pass, the " "elementwise_add Op must be" "detected and used, Please check the 
fuse pass pattern")); @@ -439,7 +439,7 @@ class FusedElemwiseAddActivationOpGrad : public FusedElemwiseActivationOpGrad { PADDLE_ENFORCE_EQ( elemntwise_add_grad_detected, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "When the FusedElemwiseAddActivationOpGrad Is used in fused pass, " "the elementwise_add_grad Op must be" "detected and used, Please check the fuse pass pattern")); diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cu b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cu index e7c436dd1fa0c..e712b78c42669 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cu @@ -23,14 +23,14 @@ PD_REGISTER_STRUCT_KERNEL(fused_elemwise_activation, ops::FusedElemwiseActivationKernel, float, double, - plat::float16) {} + phi::dtype::float16) {} PD_REGISTER_STRUCT_KERNEL(fused_elemwise_activation_grad, GPU, ALL_LAYOUT, ops::FusedElemwiseActivationGradKernel, float, double, - plat::float16) {} + phi::dtype::float16) {} PD_REGISTER_STRUCT_KERNEL(fused_elemwise_add_activation, GPU, @@ -38,11 +38,11 @@ PD_REGISTER_STRUCT_KERNEL(fused_elemwise_add_activation, ops::FusedElemwiseAddActivationKernel, float, double, - plat::float16) {} + phi::dtype::float16) {} PD_REGISTER_STRUCT_KERNEL(fused_elemwise_add_activation_grad, GPU, ALL_LAYOUT, ops::FusedElemwiseAddActivationGradKernel, float, double, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h b/paddle/fluid/operators/fused/fused_elemwise_activation_op.h index ad7f79307e628..6c476afd340fa 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.h @@ -415,8 +415,8 @@ static void RunFunctors(const framework::ExecutionContext &ctx, in_y, outputs); } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s has not been implemented.", funcs_str)); + PADDLE_THROW(phi::errors::InvalidArgument("%s has not been implemented.", + funcs_str)); } } @@ -611,8 +611,8 @@ static void RunGradFunctors(const framework::ExecutionContext &ctx, y_grad, d_intermediate_out); } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s has not been implemented.", funcs_str)); + PADDLE_THROW(phi::errors::InvalidArgument("%s has not been implemented.", + funcs_str)); } } @@ -629,10 +629,10 @@ class FusedElemwiseActivationKernel : public framework::OpKernel { "Y", "FusedElemwiseActivation"); - PADDLE_ENFORCE_EQ(ctx.HasOutput("Out"), - true, - platform::errors::InvalidArgument( - "The output(Out) should not be empty")); + PADDLE_ENFORCE_EQ( + ctx.HasOutput("Out"), + true, + phi::errors::InvalidArgument("The output(Out) should not be empty")); auto output = ctx.Output("Out"); std::vector outputs; @@ -641,7 +641,7 @@ class FusedElemwiseActivationKernel : public framework::OpKernel { if (ctx.Attr("save_intermediate_out")) { PADDLE_ENFORCE_EQ(ctx.HasOutput("IntermediateOut"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The save_intermediate_out is enable, so the " "IntermediateOut should not be empty.")); @@ -663,16 +663,16 @@ class FusedElemwiseActivationGradKernel : public framework::OpKernel { PADDLE_ENFORCE_NE( in_y, nullptr, - platform::errors::InvalidArgument("Input(Y) should not be nullptr.")); + phi::errors::InvalidArgument("Input(Y) should not be nullptr.")); phi::DenseTensor *in_out = const_cast(ctx.Input("Out")); auto in_out_grad = 
ctx.Input(framework::GradVarName("Out")); - PADDLE_ENFORCE_NE(in_out_grad, - nullptr, - platform::errors::InvalidArgument( - "Input(Out@Grad) should not be nullptr.")); + PADDLE_ENFORCE_NE( + in_out_grad, + nullptr, + phi::errors::InvalidArgument("Input(Out@Grad) should not be nullptr.")); phi::DenseTensor *in_x = const_cast(ctx.Input("X")); @@ -695,7 +695,7 @@ class FusedElemwiseActivationGradKernel : public framework::OpKernel { ctx.Input("IntermediateOut")); PADDLE_ENFORCE_NE(in_intermediate_out, nullptr, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The option of 'save_intermediate_out' is opened," " so the number of 'Out' should be two.")); } else { @@ -703,7 +703,7 @@ class FusedElemwiseActivationGradKernel : public framework::OpKernel { PADDLE_ENFORCE_NE( in_x, nullptr, - platform::errors::InvalidArgument("Input(X) should not be null.")); + phi::errors::InvalidArgument("Input(X) should not be null.")); } } @@ -712,13 +712,13 @@ class FusedElemwiseActivationGradKernel : public framework::OpKernel { PADDLE_ENFORCE_NE( in_x, nullptr, - platform::errors::InvalidArgument("Input(X) should not be null.")); + phi::errors::InvalidArgument("Input(X) should not be null.")); } else { // If functor_list contains elementwise_add, the backward doesn't use // in_x, in_y and in_out. PADDLE_ENFORCE_EQ(InputXCanBeAbsent(functor_list), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only when the compoundfunctor contains " "elementwise_add_grad, the 'X' could be absent.")); in_x = const_cast(in_out_grad); @@ -729,13 +729,13 @@ class FusedElemwiseActivationGradKernel : public framework::OpKernel { PADDLE_ENFORCE_NE( in_out, nullptr, - platform::errors::InvalidArgument("Input(X) should not be null.")); + phi::errors::InvalidArgument("Input(X) should not be null.")); } else { // If functor_list contains elementwise_add, the backward doesn't use // in_x, in_y and in_out. 
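The check that follows this comment leans on the helper declared earlier in the file: Input(X) may be omitted only when the compound functor pair contains elementwise_add, whose backward needs neither X, Y nor Out. An illustrative sketch of that rule (the helper's real body is not shown in this patch, so the function below is a stand-in, not verbatim Paddle code):

#include <string>
#include <vector>

bool XCanBeAbsent(const std::vector<std::string> &functor_list) {
  PADDLE_ENFORCE_EQ(
      functor_list.size(),
      2,
      phi::errors::InvalidArgument(
          "Invalid functor list size %d, which should be equal to %d.",
          functor_list.size(),
          2));
  // elementwise_add's backward can be recovered from Out@GRAD alone.
  return functor_list[0] == "elementwise_add" ||
         functor_list[1] == "elementwise_add";
}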
PADDLE_ENFORCE_EQ(InputXCanBeAbsent(functor_list), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only when the compoundfunctor contains " "elementwise_add_grad, the 'X' could be absent.")); in_out = const_cast(in_out_grad); diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index e69825fdd9076..e4c43e4e4efb2 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -50,12 +50,12 @@ void FusedEmbeddingFCLSTMOp::InferShape( PADDLE_ENFORCE_EQ( table_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Embeddings's rank should be 2, but received value is:%d.", table_dims.size())); PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The last dimension of the 'Ids' tensor must be 1, but " "received value is:%d.", ids_dims[ids_rank - 1])); @@ -64,14 +64,14 @@ void FusedEmbeddingFCLSTMOp::InferShape( PADDLE_ENFORCE_EQ( x_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(Ids)'s rank must be 2, but received value is:%d.", x_dims.size())); if (ctx->HasInput("H0")) { PADDLE_ENFORCE_EQ(ctx->HasInput("C0"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(Cell) and Input(Hidden) of LSTM should exist " "at the same time.")); auto h_dims = ctx->GetInputDim("H0"); @@ -79,7 +79,7 @@ void FusedEmbeddingFCLSTMOp::InferShape( PADDLE_ENFORCE_EQ( h_dims, c_dims, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of Input(H0) and Input(C0) " "should be the same, but received H0 dim is:[%s], C0 dim is[%s]", h_dims, @@ -91,19 +91,19 @@ void FusedEmbeddingFCLSTMOp::InferShape( PADDLE_ENFORCE_EQ( wh_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rank of Input(WeightH) should be 2, but received value is:%d.", wh_dims.size())); PADDLE_ENFORCE_EQ(wh_dims[0], frame_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dimension of Input(WeightH) should equal to " "frame size:%d, but received value is:%d.", frame_size, wh_dims[0])); PADDLE_ENFORCE_EQ(wh_dims[1], 4 * frame_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dimension of Input(WeightH) should equal " "to 4 * %d, but received value is:%d.", frame_size, @@ -113,19 +113,19 @@ void FusedEmbeddingFCLSTMOp::InferShape( PADDLE_ENFORCE_EQ( b_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rank of Input(Bias) should be 2, but received value is:%d.", b_dims.size())); - PADDLE_ENFORCE_EQ(b_dims[0], - 1, - platform::errors::InvalidArgument( - "The first dimension of Input(Bias) " - "should be 1, but received value is:%d.", - b_dims[0])); + PADDLE_ENFORCE_EQ( + b_dims[0], + 1, + phi::errors::InvalidArgument("The first dimension of Input(Bias) " + "should be 1, but received value is:%d.", + b_dims[0])); PADDLE_ENFORCE_EQ( b_dims[1], (ctx->Attrs().Get("use_peepholes") ? 
7 : 4) * frame_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dimension of Input(Bias) should be " "7 * %d if enable peepholes connection or" "4 * %d if disable peepholes, bias dim is:%d, use_peepholes:%d", @@ -417,11 +417,11 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { PADDLE_ENFORCE_LT( ids_data[i], row_number, - platform::errors::OutOfRange( + phi::errors::OutOfRange( "Value of Ids %d should less than dict size %d.", i, row_number)); PADDLE_ENFORCE_GE(ids_data[i], 0, - platform::errors::OutOfRange( + phi::errors::OutOfRange( "Value of Ids %d should greater than ZERO.", i)); memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width, @@ -530,11 +530,11 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { PADDLE_ENFORCE_LT( ids_data[i], row_number, - platform::errors::OutOfRange( + phi::errors::OutOfRange( "Value of Ids %d should less than dict size %d.", i, row_number)); PADDLE_ENFORCE_GE(ids_data[i], 0, - platform::errors::OutOfRange( + phi::errors::OutOfRange( "Value of Ids %d should greater than ZERO.", i)); memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width, diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc index a0ee64bd2eced..4a7691bd33844 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc @@ -37,21 +37,21 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(table_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dim size of the input tensor 'W' should be 2. " "But received W's size = %d.", table_dims.size())); PADDLE_ENFORCE_EQ( ids_dims[ids_dims.size() - 1], 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The last dimension of the input tensor 'Ids' should be 1. " "But received Ids's size in the last dimension = %d.", ids_dims[ids_dims.size() - 1])); // we only support sum now PADDLE_ENFORCE_EQ(combiner, "sum", - platform::errors::Unimplemented( + phi::errors::Unimplemented( "The pooling type of sequence_pool only support sum " "now. So the 'combiner' must be 'sum'.")); @@ -61,7 +61,7 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel { PADDLE_GET(framework::VarDesc*, ctx->GetInputVarPtrs("Ids")[0]); PADDLE_ENFORCE_EQ(ids_desc->GetLoDLevel(), 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "In compile time, the LoD Level of Ids should be 1. " "But received the LoD Level of Ids = %d.", ids_desc->GetLoDLevel())); diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h index e0186d99acb03..2a9a1e71dbd2b 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -95,7 +95,7 @@ struct EmbeddingVSumFunctor { PADDLE_ENFORCE_LE(table_width * idx_width, out_width, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "table_width * idx_width should be less than or " "equal to out_width. But received " "table_width * idx_width = %s, out_width = %d.", @@ -103,7 +103,7 @@ struct EmbeddingVSumFunctor { out_width)); PADDLE_ENFORCE_GT(ids_lod.size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The tensor ids's LoD[0] should be greater than 1. 
" "But received the ids's LoD[0] = %d.", ids_lod.size())); @@ -152,7 +152,7 @@ class FusedEmbeddingSeqPoolKernel : public framework::OpKernel { // in run time, the LoD of ids must be 1 PADDLE_ENFORCE_EQ(ids_lod.size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The LoD level of Input(Ids) should be 1. But " "received Ids's LoD level = %d.", ids_lod.size())); @@ -236,7 +236,7 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { auto *table_t = context.Input("W"); table_dim = table_t->value().dims(); } else { - PADDLE_THROW(platform::errors::PermissionDenied( + PADDLE_THROW(phi::errors::PermissionDenied( "The parameter W of a LookupTable " "must be either phi::DenseTensor or SelectedRows.")); } @@ -293,7 +293,7 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { const auto &ids_lod = ids->lod(); PADDLE_ENFORCE_EQ(ids_lod.size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The LoD level of Input(Ids) should be 1. But " "received Ids's LoD level = %d.", ids_lod.size())); diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc index f6343f5bd1cbf..5956ea5a839a7 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cc +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc @@ -79,8 +79,8 @@ class FusedFeedForwardOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT( mat_dim_x.width_, static_cast(1), - platform::errors::InvalidArgument("Product from the X shape[1] to " - "shape[n-1] must be larger than 1!")); + phi::errors::InvalidArgument("Product from the X shape[1] to " + "shape[n-1] must be larger than 1!")); auto dim_Linear1Weight = context->GetInputDim("Linear1Weight"); auto tmp_dim_x = dim_x; tmp_dim_x[dim_x.size() - 1] = @@ -190,7 +190,7 @@ class FusedFeedForwardOpMaker : public framework::OpProtoAndCheckerMaker { PADDLE_ENFORCE_EQ( drop_p >= 0.0f && drop_p <= 1.0f, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'dropout1_rate' must be between 0.0 and 1.0.")); }); AddAttr("dropout2_rate", "the dropout rate of second dropout") @@ -199,7 +199,7 @@ class FusedFeedForwardOpMaker : public framework::OpProtoAndCheckerMaker { PADDLE_ENFORCE_EQ( drop_p >= 0.0f && drop_p <= 1.0f, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'dropout2_rate' must be between 0.0 and 1.0.")); }); AddAttr("dropout1_implementation", @@ -209,7 +209,7 @@ class FusedFeedForwardOpMaker : public framework::OpProtoAndCheckerMaker { PADDLE_ENFORCE_EQ( type == "downgrade_in_infer" || type == "upscale_in_train", true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "dropout1_implementation can only be downgrade_in_infer or " "upscale_in_train")); }); @@ -220,7 +220,7 @@ class FusedFeedForwardOpMaker : public framework::OpProtoAndCheckerMaker { PADDLE_ENFORCE_EQ( type == "downgrade_in_infer" || type == "upscale_in_train", true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "dropout2_implementation can only be downgrade_in_infer or " "upscale_in_train")); }); @@ -266,7 +266,7 @@ class FusedFeedForwardOpGrad : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE_EQ(ctx->Attrs().Get("is_test"), false, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "GradOp is only callable when is_test is false")); bool pre_layer_norm = ctx->Attrs().Get("pre_layer_norm"); 
OP_INOUT_CHECK(ctx->HasInput("Dropout1Mask"), diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index 69fbca0f9be0f..cc1d0de18ada1 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -156,8 +156,8 @@ struct GateAttentionConfig { if (merge_qkv) { PADDLE_ENFORCE_NOT_NULL( qkv_weight, - platform::errors::NotFound("The input qkv_weight can not be nullptr " - "when merge_qkv is true.")); + phi::errors::NotFound("The input qkv_weight can not be nullptr " + "when merge_qkv is true.")); // When q_dim == kv_dim, QKV matmul can be computed merged. // qkv_weight: shape=[3, num_heads, head_dim, q_dim] @@ -172,12 +172,12 @@ struct GateAttentionConfig { } else { PADDLE_ENFORCE_NOT_NULL( key, - platform::errors::NotFound( + phi::errors::NotFound( "The input key can not be nullptr when merge_qkv is false.")); PADDLE_ENFORCE_NOT_NULL( query_weight, - platform::errors::NotFound("The input query_weight can not be " - "nullptr when merge_qkv is false.")); + phi::errors::NotFound("The input query_weight can not be " + "nullptr when merge_qkv is false.")); // When q_dim != kv_dim, QKV matmul must be computed saparately. // key: shape=[batch_size, seq_len_m, m_size, kv_dim] @@ -414,8 +414,8 @@ class FMHAGateRef { // qkv_transpose_out = transpose(qkv_out) PADDLE_ENFORCE_NOT_NULL( qkv_transpose_out, - platform::errors::NotFound("The input qkv_transpose_out can not be " - "nullptr when merge_qkv is true.")); + phi::errors::NotFound("The input qkv_transpose_out can not be " + "nullptr when merge_qkv is true.")); phi::DenseTensor* qkv_out = config->GetQKVOut(); ComputeQKVTransposeForward(*qkv_out, qkv_transpose_out); @@ -429,16 +429,16 @@ class FMHAGateRef { } else { PADDLE_ENFORCE_NOT_NULL( q_transpose_out, - platform::errors::NotFound("The input q_transpose_out can not be " - "nullptr when merge_qkv is false.")); + phi::errors::NotFound("The input q_transpose_out can not be " + "nullptr when merge_qkv is false.")); PADDLE_ENFORCE_NOT_NULL( k_transpose_out, - platform::errors::NotFound("The input k_transpose_out can not be " - "nullptr when merge_qkv is false.")); + phi::errors::NotFound("The input k_transpose_out can not be " + "nullptr when merge_qkv is false.")); PADDLE_ENFORCE_NOT_NULL( v_transpose_out, - platform::errors::NotFound("The input v_transpose_out can not be " - "nullptr when merge_qkv is false.")); + phi::errors::NotFound("The input v_transpose_out can not be " + "nullptr when merge_qkv is false.")); phi::DenseTensor* query_out = config->GetQueryOut(); phi::DenseTensor* key_out = config->GetKeyOut(); @@ -544,8 +544,8 @@ class FMHAGateRef { if (merge_qkv_) { PADDLE_ENFORCE_NOT_NULL( qkv_transpose_out, - platform::errors::NotFound("The input qkv_transpose_out can not be " - "nullptr when merge_qkv is true.")); + phi::errors::NotFound("The input qkv_transpose_out can not be " + "nullptr when merge_qkv is true.")); int64_t q_size = config->GetQuerySize(); q_ptr = qkv_transpose_out->data(); @@ -562,16 +562,16 @@ class FMHAGateRef { } else { PADDLE_ENFORCE_NOT_NULL( q_transpose_out, - platform::errors::NotFound("The input q_transpose_out can not be " - "nullptr when merge_qkv is false.")); + phi::errors::NotFound("The input q_transpose_out can not be " + "nullptr when merge_qkv is false.")); PADDLE_ENFORCE_NOT_NULL( k_transpose_out, - platform::errors::NotFound("The input k_transpose_out can not be " - "nullptr when merge_qkv is false.")); + 
phi::errors::NotFound("The input k_transpose_out can not be " + "nullptr when merge_qkv is false.")); PADDLE_ENFORCE_NOT_NULL( v_transpose_out, - platform::errors::NotFound("The input v_transpose_out can not be " - "nullptr when merge_qkv is false.")); + phi::errors::NotFound("The input v_transpose_out can not be " + "nullptr when merge_qkv is false.")); q_ptr = q_transpose_out->data(); k_ptr = k_transpose_out->data(); @@ -787,11 +787,11 @@ class FMHAGateRef { phi::DenseTensor* nonbatched_bias_grad) { PADDLE_ENFORCE_NOT_NULL( qk_out_grad, - platform::errors::NotFound("The qk_out_grad can not be nullptr.")); + phi::errors::NotFound("The qk_out_grad can not be nullptr.")); PADDLE_ENFORCE_EQ(qk_out_grad->dims(), softmax_out->dims(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of qk_out_grad and softmax_out is " "expected to be the same. But received qk_out_grad's " "shape = %s, softmax_out's shape = %s.", @@ -800,7 +800,7 @@ class FMHAGateRef { PADDLE_ENFORCE_EQ(src_mask_grad, nullptr, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "src_mask_grad is expected to be nullptr.")); phi::SoftmaxBackwardCUDAKernelDriver( @@ -874,8 +874,8 @@ class FlashAttnWithGating { PADDLE_ENFORCE_NOT_NULL( qkv_transpose_out, - platform::errors::NotFound("The input qkv_transpose_out can not be " - "nullptr when merge_qkv is true.")); + phi::errors::NotFound("The input qkv_transpose_out can not be " + "nullptr when merge_qkv is true.")); // 1. Transpose qkv_out for flash_attn. phi::DenseTensor* qkv_out = config->GetQKVOut(); @@ -989,8 +989,8 @@ class FlashAttnWithGating { PADDLE_ENFORCE_NOT_NULL( qkv_transpose_out, - platform::errors::NotFound("The input qkv_transpose_out can not be" - "nullptr when merge_qkv is true.")); + phi::errors::NotFound("The input qkv_transpose_out can not be" + "nullptr when merge_qkv is true.")); int64_t q_size = config->GetQuerySize(); const T* q_ptr = qkv_transpose_out->data(); diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu index d066086bd6ae0..78202f70bcffb 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -401,7 +401,7 @@ class FusedGateAttentionOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( !key || query == key || query->data() == key->data(), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "key is expected to be nullptr or the same as " "query, but received key=%p, query=%p.", key, @@ -623,14 +623,14 @@ PD_REGISTER_STRUCT_KERNEL(fused_gate_attention, ALL_LAYOUT, ops::FusedGateAttentionOpKernel, float, - plat::float16, + phi::dtype::float16, plat::bfloat16) {} PD_REGISTER_STRUCT_KERNEL(fused_gate_attention_grad, GPU, ALL_LAYOUT, ops::FusedGateAttentionGradKernel, float, - plat::float16, + phi::dtype::float16, plat::bfloat16) {} #else PD_REGISTER_STRUCT_KERNEL(fused_gate_attention, @@ -639,7 +639,7 @@ PD_REGISTER_STRUCT_KERNEL(fused_gate_attention, ops::FusedGateAttentionOpKernel, float, double, - plat::float16, + phi::dtype::float16, plat::bfloat16) {} PD_REGISTER_STRUCT_KERNEL(fused_gate_attention_grad, GPU, @@ -647,6 +647,6 @@ PD_REGISTER_STRUCT_KERNEL(fused_gate_attention_grad, ops::FusedGateAttentionGradKernel, float, double, - plat::float16, + phi::dtype::float16, plat::bfloat16) {} #endif diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h 
b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h
index a6bd467dc1992..157ab69afc943 100644
--- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h
+++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h
@@ -466,12 +466,11 @@ struct FusedLayernormResidualDropoutBiasFunctor {
   }
 };
 
-template struct FusedLayernormResidualDropoutBiasFunctor<
-    paddle::platform::float16,
-    uint8_t,
-    8,
-    float,
-    false>;
+template struct FusedLayernormResidualDropoutBiasFunctor<phi::dtype::float16,
+                                                         uint8_t,
+                                                         8,
+                                                         float,
+                                                         false>;
 
 /*
  * @brief layernorm(residual + dropout(x));
@@ -872,7 +871,7 @@ void LaunchLayernormResidualDropoutBias(
           epsilon,
           cols));
     default:
-      PADDLE_THROW(platform::errors::InvalidArgument(
+      PADDLE_THROW(phi::errors::InvalidArgument(
           "Product from begin_norm_axis to end must be larger than 1"));
       break;
   }
@@ -1037,7 +1036,7 @@ void LaunchLayernormResidualDropoutBias(
     switch (cols) {
       LAUNCH_FUSED_FAST_LN_KERNEL;
       default:
-        PADDLE_THROW(platform::errors::InvalidArgument(
+        PADDLE_THROW(phi::errors::InvalidArgument(
            "Only when column is equal to 768/1024/4096 is supported for "
            "now"));
        break;
diff --git a/paddle/fluid/operators/fused/fused_matmul_op.cc b/paddle/fluid/operators/fused/fused_matmul_op.cc
index 129f7e85386e7..93d79d677f8a5 100644
--- a/paddle/fluid/operators/fused/fused_matmul_op.cc
+++ b/paddle/fluid/operators/fused/fused_matmul_op.cc
@@ -176,47 +176,47 @@ class FusedMatmulOpMaker : public framework::OpProtoAndCheckerMaker {
 protected:
  void Apply() {
    AddInput("ResidualData",
-             "Extra input from matmul_elementwise_add_mkldnn_fuse_pass")
+             "Extra input from matmul_elementwise_add_onednn_fuse_pass")
        .AsDispensable()
        .AsExtra();
    AddAttr<float>("matmul_alpha", "Output scale used in matmul_v1")
        .SetDefault(1.0f);
    AddAttr<std::string>(
        "fuse_activation",
-        "Activation type from matmul_activation_mkldnn_fuse_pass")
+        "Activation type from matmul_activation_onednn_fuse_pass")
        .SetDefault("");
    AddAttr<float>("fuse_alpha",
-                   "Activation alpha from matmul_activation_mkldnn_fuse_pass")
+                   "Activation alpha from matmul_activation_onednn_fuse_pass")
        .SetDefault(0.0f);
    AddAttr<float>("fuse_beta",
-                   "Activation beta from matmul_activation_mkldnn_fuse_pass")
+                   "Activation beta from matmul_activation_onednn_fuse_pass")
        .SetDefault(0.0f);
    AddAttr<float>("fused_output_scale",
                   "Output scale from operator_scale_onednn_fuse_pass")
        .SetDefault(1.0f);
    AddAttr<std::vector<int>>("fused_reshape_X",
                              "Reshape's shape attribute from "
-                              "reshape_transpose_matmul_mkldnn_fuse_pass")
+                              "reshape_transpose_matmul_onednn_fuse_pass")
        .SetDefault({});
    AddAttr<std::vector<int>>("fused_transpose_X",
                              "Transpose's axis attribute from "
-                              "reshape_transpose_matmul_mkldnn_fuse_pass")
+                              "reshape_transpose_matmul_onednn_fuse_pass")
        .SetDefault({});
    AddAttr<std::vector<int>>("fused_reshape_Y",
                              "Reshape's shape attribute from "
-                              "reshape_transpose_matmul_mkldnn_fuse_pass")
+                              "reshape_transpose_matmul_onednn_fuse_pass")
        .SetDefault({});
    AddAttr<std::vector<int>>("fused_transpose_Y",
                              "Transpose's axis attribute from "
-                              "reshape_transpose_matmul_mkldnn_fuse_pass")
+                              "reshape_transpose_matmul_onednn_fuse_pass")
        .SetDefault({});
    AddAttr<std::vector<int>>("fused_reshape_Out",
                              "Reshape's shape attribute from "
-                              "matmul_transpose_reshape_mkldnn_fuse_pass")
+                              "matmul_transpose_reshape_onednn_fuse_pass")
        .SetDefault({});
    AddAttr<std::vector<int>>("fused_transpose_Out",
                              "Transpose's axis attribute from "
-                              "matmul_transpose_reshape_mkldnn_fuse_pass")
+                              "matmul_transpose_reshape_onednn_fuse_pass")
        .SetDefault({});
    AddAttr<std::string>("mkldnn_data_type", "oneDNN operator data type")
        .SetDefault("float32")
diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc
b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc index 562ddf7ae6c4e..dcea415e32508 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc +++ b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc @@ -72,23 +72,23 @@ class FusedMultiTransformerINT8Op : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( x_dim.size(), 3, - platform::errors::InvalidArgument("The dimensions of x must be 3" - "(batch_size, seq_len, dim_embed)," - "but received dimensions of" - "Input is [%d]", - x_dim.size())); - PADDLE_ENFORCE_EQ(y_dim.size(), - 4, - platform::errors::InvalidArgument( - "The dimensions of qkv_weight must be 4" - "(3, num_head, dim_head, dim_embed)," - "but received dimensions of" - "Input is [%d]", - y_dim.size())); + phi::errors::InvalidArgument("The dimensions of x must be 3" + "(batch_size, seq_len, dim_embed)," + "but received dimensions of" + "Input is [%d]", + x_dim.size())); + PADDLE_ENFORCE_EQ( + y_dim.size(), + 4, + phi::errors::InvalidArgument("The dimensions of qkv_weight must be 4" + "(3, num_head, dim_head, dim_embed)," + "but received dimensions of" + "Input is [%d]", + y_dim.size())); PADDLE_ENFORCE_EQ( x_dim[2], trans_qkvw ? y_dim[3] : y_dim[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: the dimension of x_dim[2] and y_dim[3](trans_qkvw is " "true) or y_dim[0](trans_qkvw is false)" "must be equal. But received: the shape " @@ -101,7 +101,7 @@ class FusedMultiTransformerINT8Op : public framework::OperatorWithKernel { if (trans_qkvw) { PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2], y_dim[3], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimensions of qkv_weight must be 4" "(3, num_head, dim_head, dim_embed)," "and must satisfy the limitations: " @@ -110,7 +110,7 @@ class FusedMultiTransformerINT8Op : public framework::OperatorWithKernel { } else { PADDLE_ENFORCE_EQ(y_dim[2] * y_dim[3], y_dim[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimensions of qkv_weight must be 4" "(dim_embed, 3, num_head, dim_head)," "and must satisfy the limitations: " @@ -126,23 +126,23 @@ class FusedMultiTransformerINT8Op : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( c_dim.size(), 5, - paddle::platform::errors::InvalidArgument( - "The CacheKV must be 5 dims, but got %d", c_dim.size())); + phi::errors::InvalidArgument("The CacheKV must be 5 dims, but got %d", + c_dim.size())); PADDLE_ENFORCE_EQ(c_dim[0], 2, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dim of CacheKV must be 2, but got %d", c_dim[0])); // 2 PADDLE_ENFORCE_EQ(c_dim[1], x_dim[0], - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dim of CacheKV must be equal with " "batch size %d, but got %d", x_dim[0], c_dim[1])); // batch_size PADDLE_ENFORCE_EQ(c_dim[2], trans_qkvw ? y_dim[1] : y_dim[2], - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The third dim of CacheKV must be equal with num " "head %d, but got %d", trans_qkvw ? y_dim[1] : y_dim[2], @@ -150,12 +150,12 @@ class FusedMultiTransformerINT8Op : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT( c_dim[3], 0, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The forth dim of CacheKV must be greater than 0, but got %d", c_dim[3])); // cache_seq_len PADDLE_ENFORCE_EQ(c_dim[4], trans_qkvw ? 
y_dim[2] : y_dim[3], - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The fifth dim of CacheKV must be equal with head " "size %d, but got %d", trans_qkvw ? y_dim[2] : y_dim[3], @@ -273,7 +273,7 @@ class FusedMultiTransformerINT8OpMaker .AddCustomChecker([](const float &epsilon) { PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'epsilon' in Op(LayerNorm) should be between" "0.0 and 0.001, But received [%s].", epsilon)); @@ -284,7 +284,7 @@ class FusedMultiTransformerINT8OpMaker .AddCustomChecker([](const float &drop_p) { PADDLE_ENFORCE_EQ(drop_p >= 0.0f && drop_p <= 1.0f, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'dropout_rate' must be between 0.0 and 1.0.")); }); @@ -301,7 +301,7 @@ class FusedMultiTransformerINT8OpMaker PADDLE_ENFORCE_EQ( type == "downgrade_in_infer" || type == "upscale_in_train", true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "dropout_implementation can only be downgrade_in_infer or " "upscale_in_train")); }); diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu index a76e93f5cdcf5..5893024c0e958 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu @@ -106,19 +106,19 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { if (time_step) { PADDLE_ENFORCE_EQ(time_step->place(), platform::CPUPlace(), - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The place of input(TimeStep) must be CPUPlace.")); // cache_seq_len int time_step_value = time_step->data()[0]; PADDLE_ENFORCE_GT(time_step_value, 0, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The value of time_step must > 0, but now is %d", time_step_value)); PADDLE_ENFORCE_EQ( seq_len, 1, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "In decode stage, the seq_len of input must be 1, but now is %d", seq_len)); out_seq_len += time_step_value; @@ -668,4 +668,4 @@ PD_REGISTER_STRUCT_KERNEL(fused_multi_transformer_int8, ALL_LAYOUT, ops::FusedMultiTransformerINT8OpKernel, float, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc index a8bd90c7da5d4..dc90eaa3e5306 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc @@ -66,23 +66,23 @@ class FusedMultiTransformerOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( x_dim.size(), 3, - platform::errors::InvalidArgument("The dimensions of x must be 3" - "(batch_size, seq_len, dim_embed)," - "but received dimensions of" - "Input is [%d]", - x_dim.size())); - PADDLE_ENFORCE_EQ(y_dim.size(), - 4, - platform::errors::InvalidArgument( - "The dimensions of qkv_weight must be 4" - "(3, num_head, dim_head, dim_embed)," - "but received dimensions of" - "Input is [%d]", - y_dim.size())); + phi::errors::InvalidArgument("The dimensions of x must be 3" + "(batch_size, seq_len, dim_embed)," + "but received dimensions of" + "Input is [%d]", + x_dim.size())); + PADDLE_ENFORCE_EQ( + y_dim.size(), + 4, + phi::errors::InvalidArgument("The dimensions of qkv_weight must be 4" + "(3, num_head, dim_head, dim_embed)," + "but received 
dimensions of" + "Input is [%d]", + y_dim.size())); PADDLE_ENFORCE_EQ( x_dim[2], trans_qkvw ? y_dim[3] : y_dim[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: the dimension of x_dim[2] and y_dim[3](trans_qkvw is " "true) or y_dim[0](trans_qkvw is false)" "must be equal. But received: the shape " @@ -99,30 +99,30 @@ class FusedMultiTransformerOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( c_dim.size(), 5, - paddle::platform::errors::InvalidArgument( - "The CacheKV must be 5 dims, but got %d", c_dim.size())); + phi::errors::InvalidArgument("The CacheKV must be 5 dims, but got %d", + c_dim.size())); PADDLE_ENFORCE_EQ(c_dim[0], 2, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dim of CacheKV must be 2, but got %d", c_dim[0])); // 2 PADDLE_ENFORCE_EQ(c_dim[1], x_dim[0], - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dim of CacheKV must be equal with " "batch size %d, but got %d", x_dim[0], c_dim[1])); // batch_size PADDLE_ENFORCE_EQ(c_dim[2], trans_qkvw ? y_dim[1] : y_dim[2], - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The third dim of CacheKV must be equal with num " "head %d, but got %d", trans_qkvw ? y_dim[1] : y_dim[2], c_dim[2])); // num_head PADDLE_ENFORCE_EQ(c_dim[4], trans_qkvw ? y_dim[2] : y_dim[3], - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The fifth dim of CacheKV must be equal with head " "size %d, but got %d", trans_qkvw ? y_dim[2] : y_dim[3], @@ -223,7 +223,7 @@ class FusedMultiTransformerOpOpMaker PADDLE_ENFORCE_EQ( rotary_emb_dims >= 0 && rotary_emb_dims <= 2, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'rotary_emb_dims' in Op(Rotray) should be between" "0 and 2, But received [%s].", rotary_emb_dims)); @@ -234,7 +234,7 @@ class FusedMultiTransformerOpOpMaker .AddCustomChecker([](const float &epsilon) { PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'epsilon' in Op(LayerNorm) should be between" "0.0 and 0.001, But received [%s].", epsilon)); @@ -245,7 +245,7 @@ class FusedMultiTransformerOpOpMaker .AddCustomChecker([](const float &drop_p) { PADDLE_ENFORCE_EQ(drop_p >= 0.0f && drop_p <= 1.0f, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'dropout_rate' must be between 0.0 and 1.0.")); }); @@ -262,7 +262,7 @@ class FusedMultiTransformerOpOpMaker PADDLE_ENFORCE_EQ( type == "downgrade_in_infer" || type == "upscale_in_train", true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "dropout_implementation can only be downgrade_in_infer or " "upscale_in_train")); }); @@ -272,7 +272,7 @@ class FusedMultiTransformerOpOpMaker PADDLE_ENFORCE_EQ( act_type == "gelu" || act_type == "relu" || act_type == "none", true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only support `gelu`, `relu`, `none` activation in " "FusedMultiTransformer. 
")); }); diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h index 415a6ba1ffdf3..4bf467e9caf8f 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h @@ -131,7 +131,7 @@ static void AllReduce(phi::DenseTensor &tensor, // NOLINT namespace { // NOLINT namespace plat = paddle::platform; -using float16 = plat::float16; +using float16 = phi::dtype::float16; #define MMHA_USE_FP32_ACUM_FOR_LOGITS #define MMHA_USE_FP32_ACUM_FOR_OUT @@ -746,9 +746,9 @@ inline __device__ void convert_from_float(float4 &dst, float4 src) { // NOLINT dst = src; } -inline __device__ void convert_from_float(plat::float16 &dst, // NOLINT +inline __device__ void convert_from_float(phi::dtype::float16 &dst, // NOLINT float src) { - dst = static_cast(src); + dst = static_cast(src); } inline __device__ void convert_from_float(uint4 &dst, Float8_ src) { // NOLINT diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc index 3dbba2bf42ce4..93ac8f4e220c9 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc @@ -25,24 +25,23 @@ class FusedSeqpoolCVMOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE( ctx->Inputs("X").size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Inputs(X) of FusedSeqpoolCVMOp should not be empty.")); PADDLE_ENFORCE_GE( ctx->Outputs("Out").size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Outputs(Out) of FusedSeqpoolCVMOp should not be empty.")); auto cvm_dims = ctx->GetInputDim("CVM"); PADDLE_ENFORCE_EQ( cvm_dims.size(), 2UL, - platform::errors::InvalidArgument("Input(CVM)'s rank should be 2.")); - PADDLE_ENFORCE_EQ( - cvm_dims[1], - 2UL, - platform::errors::InvalidArgument("The 2nd dimension of " - "Input(CVM) should be 2.")); + phi::errors::InvalidArgument("Input(CVM)'s rank should be 2.")); + PADDLE_ENFORCE_EQ(cvm_dims[1], + 2UL, + phi::errors::InvalidArgument("The 2nd dimension of " + "Input(CVM) should be 2.")); auto ins_dims = ctx->GetInputsDim("X"); const int cvm_offset = ctx->Attrs().Get("cvm_offset"); @@ -53,7 +52,7 @@ class FusedSeqpoolCVMOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT(num_inputs, 0UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input tensors count should be greater than 0, " "but received value is %d.", num_inputs)); @@ -62,7 +61,7 @@ class FusedSeqpoolCVMOp : public framework::OperatorWithKernel { // since input lod is not accessible here. 
PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dims size of first input should be equal to 2, " "but received value is %d.", ins_dims[0].size())); @@ -88,7 +87,7 @@ class FusedSeqpoolCVMOp : public framework::OperatorWithKernel { } else { PADDLE_ENFORCE_EQ(batch_size, cur_batch_size, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The batch size of all input should be same, " "please check, last batch_size is %d, current " "batch_size is %d", @@ -111,7 +110,7 @@ class FusedSeqpoolCVMOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT( dims[rank - 1], 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Shape error in %lu id, the last dimension(embedding) of the " "'X' tensor must be larger than 2.", i)); @@ -145,7 +144,7 @@ class FusedSeqpoolCVMOp : public framework::OperatorWithKernel { } PADDLE_ENFORCE_EQ(flag, 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "All Inputs of fused_seqpool_cvm OP are Empty!")); return phi::KernelKey(input_data_type, ctx.GetPlace()); // return phi::KernelKey(framework::proto::VarType::FP32, @@ -201,13 +200,13 @@ class FusedSeqpoolCVMGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( cvm_dims.size(), 2, - platform::errors::InvalidArgument("Input(CVM)'s rank should be 2.")); + phi::errors::InvalidArgument("Input(CVM)'s rank should be 2.")); for (size_t i = 0; i < og_dims.size(); i++) { PADDLE_ENFORCE_EQ( og_dims[i].size(), x_dims[i].size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rank of output grad must equal to Input(X). But " "received: input rank %u, input shape [%s].", og_dims[i].size(), @@ -217,7 +216,7 @@ class FusedSeqpoolCVMGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( o_dim, x_dims[i][og_dims[i].size() - 1], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension mismatch between Input(OUT@GRAD) and " "Input(X). Received Input(OUT@GRAD): input rank %u, " "input shape [%s]; received Input(X): input rank %u, " @@ -230,7 +229,7 @@ class FusedSeqpoolCVMGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( og_dims[i][og_dims[i].size() - 1], x_dims[i][og_dims[i].size() - 1] - cvm_offset, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension mismatch between Input(OUT@GRAD) and " "Input(X). 
Received Input(OUT@GRAD): input rank %u, " "input shape [%s]; received Input(X): input rank %u, " diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu index 362860aa23bdf..df00c74a30237 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu @@ -463,7 +463,7 @@ class FusedSeqpoolCVMCUDAKernel : public framework::OpKernel { } else { PADDLE_ENFORCE_EQ(batch_size, cur_batch_size, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The batch size of all input should be same, " "please cheack, last batchsize is %d, current " "batchsize is %d", @@ -550,7 +550,7 @@ class FusedSeqpoolCVMGradCUDAKernel : public framework::OpKernel { } else { PADDLE_ENFORCE_EQ(batch_size, cur_batch_size, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The batch size of all input should be same, " "please cheack, last batchsize is %d, current " "batchsize is %d", diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.h b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.h index dcc76bbf95254..bd4475da0b8ea 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.h +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.h @@ -27,7 +27,7 @@ template class FusedSeqpoolCVMOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(phi::errors::Unimplemented( "Unimplemented CPU kernel for FusedSeqpoolCVMOp, only support GPU " "now.")); } @@ -37,7 +37,7 @@ template class FusedSeqpoolCVMGradOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(phi::errors::Unimplemented( "Unimplemented CPU kernel for FusedSeqpoolCVMGradOp, only support GPU " "now.")); } diff --git a/paddle/fluid/operators/fused/fused_softmax_mask.cu.h b/paddle/fluid/operators/fused/fused_softmax_mask.cu.h index 12e511fe3aef9..cb3292a60ebd2 100644 --- a/paddle/fluid/operators/fused/fused_softmax_mask.cu.h +++ b/paddle/fluid/operators/fused/fused_softmax_mask.cu.h @@ -123,9 +123,10 @@ __global__ void FusedSoftmaxMaskVecKernel(T* dst, // #define SELECT_SOFTMAX_MASK_KERNEL(ELEMENTS) \ // do { \ // if (sizeof(T) == 2 && seq_len % 8 == 0) { \ -// FusedSoftmaxMaskVecKernel \ +// FusedSoftmaxMaskVecKernel \ // <<>>( \ -// (plat::float16*)dst, (const plat::float16*)src, mask, seq_len); \ +// (phi::dtype::float16*)dst, (const phi::dtype::float16*)src, mask, +// seq_len); \ // } \ // else if (seq_len % 4 == 0) SOFTMAX_MASK_KERNEL(4, ELEMENTS); \ // else if (seq_len % 2 == 0) SOFTMAX_MASK_KERNEL(2, ELEMENTS); \ @@ -159,9 +160,9 @@ void LaunchFusedSoftmaxMaskKernel(const T* src, PADDLE_ENFORCE_EQ( seq_len > 0 && seq_len <= 4096, true, - platform::errors::InvalidArgument("seq_len must be between (0, 4096] " - "received the seq_len is %d", - seq_len)); + phi::errors::InvalidArgument("seq_len must be between (0, 4096] " + "received the seq_len is %d", + seq_len)); constexpr int block_size = 128; constexpr int warp_size = 32; @@ -196,7 +197,7 @@ void LaunchFusedSoftmaxMaskKernel(const T* src, CASE_SOFTMAX_MASK_KERNEL(64); // <=2048 CASE_SOFTMAX_MASK_KERNEL(128); // <=4096 default: - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "seq_len must be between (0, 4096], received the 
seq_len is %d", seq_len)); } diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cc b/paddle/fluid/operators/fused/fusion_conv_inception_op.cc deleted file mode 100644 index 41a9299f7258c..0000000000000 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cc +++ /dev/null @@ -1,130 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/backends/gpu/cuda/cudnn_workspace_helper.h" - -namespace paddle { -namespace operators { - -class ConvInceptionFusionOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - // 1 x - auto in_dims = ctx->GetInputDim("Input"); - // 4 filters - auto w_dims = ctx->GetInputsDim("Filter"); - - PADDLE_ENFORCE_EQ( - in_dims.size(), - 4, - platform::errors::InvalidArgument("Conv intput should be 4-D tensor.")); - PADDLE_ENFORCE_EQ( - w_dims.size(), - 4, - platform::errors::InvalidArgument("There should be 4 filters.")); - PADDLE_ENFORCE_EQ(w_dims[0][1], - in_dims[1], - platform::errors::InvalidArgument( - "Invalid filter channel number %d, which should be " - "equal to input channel number %d.", - w_dims[0][1], - in_dims[1])); - PADDLE_ENFORCE_EQ(w_dims[1][1], - in_dims[1], - platform::errors::InvalidArgument( - "Invalid filter channel number %d, which should be " - "equal to input channel number %d.", - w_dims[1][1], - in_dims[1])); - - int n = in_dims[0]; - // compute output channel - // 1st channel - int c = w_dims[0][0]; - // add 2nd channel - c += (w_dims[1][0] - w_dims[2][1] * 2); - // add 3rd channel - c += (w_dims[2][0] - w_dims[3][1]); - // add 4-th channel - c += w_dims[3][0]; - - int h = in_dims[2]; - int w = in_dims[3]; - - ctx->SetOutputDim("Output", {n, c, h, w}); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "Input"), - ctx.GetPlace()); - } -}; - -class ConvInceptionFusionOpMaker : public framework::OpProtoAndCheckerMaker { - protected: - void Make() override { - AddInput("Input", "(Tensor) NCHW layout."); - AddInput("Filter", "(vector) 4 aggregated filters").AsDuplicable(); - AddInput("Bias", "(vector) it's length is equal to Filter") - .AsDuplicable(); - AddOutput("Output", - "(Tensor) The output tensor of convolution operator. " - "The format of output tensor is also NCHW."); - AddOutput("TempOutput", "").AsDuplicable(); - AddAttr( - "pooling_type", - "(string), pooling type, can be \"max\" for max-pooling " - "and \"avg\" for average-pooling.") - .InEnum({"max", "avg"}); - AddAttr( - "exclusive", - "(bool, default True) When true, will exclude the zero-padding in the " - "averaging calculating, otherwise, include the zero-padding. Note, it " - "is only used when pooling_type is avg. 
The default is True.") - .SetDefault(true); - AddAttr( - "activation", - "The activation type can be 'identity', 'sigmoid', 'relu', 'relu6' " - "'relux' , 'tanh', 'band_pass'") - .SetDefault("relu"); - AddAttr("workspace_size_MB", - "Only used in cudnn kernel. Need set use_cudnn to true." - "workspace size for cudnn, in MB, " - "workspace is a section of GPU memory which will be " - "allocated/freed each time the operator runs, larger " - "workspace size can increase performance but also requires " - "better hardware. This size should be chosen carefully.") - .SetDefault(phi::backends::gpu::GetDefaultConvWorkspaceSizeLimitMB()); - AddComment(R"DOC( -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - conv2d_inception_fusion, - ops::ConvInceptionFusionOp, - ops::ConvInceptionFusionOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu deleted file mode 100644 index 63f065e0fef49..0000000000000 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu +++ /dev/null @@ -1,350 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/phi/kernels/gpudnn/conv_gpudnn_info.h" - -namespace paddle { -namespace operators { - -#if CUDNN_VERSION >= 7100 -using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; -using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; -using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; -using ScopedActivationDescriptor = platform::ScopedActivationDescriptor; -using DataLayout = platform::DataLayout; - -using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor; -using PoolingMode = platform::PoolingMode; -template -using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; - -template -using CudnnDataType = platform::CudnnDataType; - -template -class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { -#if CUDNN_VERSION < 7100 - PADDLE_THROW(phi::errors::Unimplemented( - "The conv2d_inception_fusion operator is not supported on GPU " - "when CUDNN version < 7.1.0")); -#endif - auto& dev_ctx = ctx.template device_context(); - auto* input = ctx.Input("Input"); - auto filters = ctx.MultiInput("Filter"); - auto bias = ctx.MultiInput("Bias"); - - auto* output = ctx.Output("Output"); - auto temp_outs = ctx.MultiOutput("TempOutput"); - - const std::string pool_type = ctx.Attr("pooling_type"); - const std::string activation = ctx.Attr("activation"); - const bool exclusive = ctx.Attr("exclusive"); - - int64_t user_workspace_size = - static_cast(ctx.Attr("workspace_size_MB")); - - const T* input_data = input->data(); - T* output_data = dev_ctx.Alloc(output, output->numel() * sizeof(T)); - temp_outs[0]->Resize(input->dims()); - T* temp_data = - dev_ctx.Alloc(temp_outs[0], temp_outs[0]->numel() * sizeof(T)); - - DataLayout layout = DataLayout::kNCHW; - std::vector in_dim = common::vectorize(input->dims()); - - // ------------------- cudnn descriptors --------------------- - PoolingMode pooling_mode; - if (pool_type == "max") { - pooling_mode = PoolingMode::kMaximum; - } else { - pooling_mode = exclusive ? 
PoolingMode::kAverageExclusive - : (PoolingMode::kAverageInclusive); - } - std::vector k0x0 = {0, 0}; - std::vector k1x1 = {1, 1}; - std::vector k1x1_2 = {1, 1}; - std::vector k3x3 = {3, 3}; - ScopedPoolingDescriptor pool_desc; - ScopedActivationDescriptor act_desc; - ScopedTensorDescriptor out_pool_desc; - ScopedTensorDescriptor input_desc; - cudnnPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, k3x3, k1x1, k1x1); - - cudnnTensorDescriptor_t cudnn_input_desc = - input_desc.descriptor(layout, common::vectorize(input->dims())); - cudnnTensorDescriptor_t pool_out_desc = out_pool_desc.descriptor( - layout, common::vectorize(input->dims())); - - cudnnDataType_t cudnn_dtype = CudnnDataType::type; - cudnnTensorDescriptor_t* out_desc = new cudnnTensorDescriptor_t[4]; - cudnnFilterDescriptor_t* filter_desc = new cudnnFilterDescriptor_t[4]; - cudnnTensorDescriptor_t* bias_desc = new cudnnTensorDescriptor_t[4]; - cudnnTensorDescriptor_t* in_desc = new cudnnTensorDescriptor_t[4]; - cudnnConvolutionDescriptor_t* conv_desc = - new cudnnConvolutionDescriptor_t[4]; - for (int i = 0; i < 4; ++i) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateFilterDescriptor(&filter_desc[i])); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&bias_desc[i])); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&in_desc[i])); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&out_desc[i])); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateConvolutionDescriptor(&conv_desc[i])); - } - - std::vector> filter_dims; - std::vector> bias_dims; - std::vector> in_dims; - std::vector> out_dims; - std::vector> in_strides; - std::vector> out_strides; - std::vector> bias_strides; - - cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW; - int n = in_dim[0]; - int h = in_dim[2]; - int w = in_dim[3]; - int oc = output->dims()[1]; - - cudnnDataType_t compute_type = (cudnn_dtype == CUDNN_DATA_DOUBLE) - ? 
CUDNN_DATA_DOUBLE - : CUDNN_DATA_FLOAT; - - for (int i = 0; i < 4; ++i) { - filter_dims.push_back(common::vectorize(filters[i]->dims())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor( - filter_desc[i], cudnn_dtype, format, 4, filter_dims[i].data())); - bias_dims.push_back({1, filter_dims[i][0], 1, 1}); - bias_strides.push_back({filter_dims[i][0], 1, 1, 1}); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - bias_desc[i], - cudnn_dtype, - 4, - bias_dims[i].data(), - bias_strides[i].data())); - in_dims.push_back({n, filter_dims[i][1], h, w}); - out_dims.push_back({n, filter_dims[i][0], h, w}); - in_strides.push_back({filter_dims[i][1] * h * w, h * w, w, 1}); - out_strides.push_back({oc * h * w, h * w, w, 1}); - - if (i < 2) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnSetConvolutionNdDescriptor( - conv_desc[i], - 2, - k0x0.data(), - k1x1.data(), - k1x1.data(), - CUDNN_CROSS_CORRELATION, - compute_type)); - } else { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnSetConvolutionNdDescriptor( - conv_desc[i], - 2, - k1x1.data(), - k1x1.data(), - k1x1.data(), - CUDNN_CROSS_CORRELATION, - compute_type)); - } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( - conv_desc[i], CUDNN_DEFAULT_MATH)); -#if CUDA_VERSION >= 11000 && CUDNN_VERSION >= 8000 - if (!phi::allow_tf32_cudnn) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnSetConvolutionMathType(conv_desc[i], - CUDNN_FMA_MATH)); - } -#endif // CUDA_VERSION >= 11000 && CUDNN_VERSION >= 8000 - } - in_dims[2][1] *= 2; - in_strides[2][0] = oc * h * w; - out_strides[2][0] = filter_dims[2][0] * h * w; // this out is continuous. - in_strides[3][0] = filter_dims[2][0] * h * w; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnSetConvolutionGroupCount(conv_desc[2], 2)); - - cudnnConvolutionFwdAlgo_t algo[4]; - auto handle = dev_ctx.cudnn_handle(); - size_t workspace_size_in_bytes = 0; // final workspace to allocate. 
- - size_t workspace_size_limit = 0; - if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) { - int64_t max_user_size = - std::min(static_cast(FLAGS_conv_workspace_size_limit), - user_workspace_size); - workspace_size_limit = max_user_size * 1024 * 1024; - } - - for (int i = 0; i < 4; ++i) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - in_desc[i], cudnn_dtype, 4, in_dims[i].data(), in_strides[i].data())); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnSetTensorNdDescriptor(out_desc[i], - cudnn_dtype, - 4, - out_dims[i].data(), - out_strides[i].data())); - - int perf_count; - int best_algo_idx = 0; - size_t tmp_size = 0; - std::unique_ptr perf_results( - new cudnnConvolutionFwdAlgoPerf_t[phi::kNUM_CUDNN_FWD_ALGS]); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( - handle, - in_desc[i], - filter_desc[i], - conv_desc[i], - out_desc[i], - phi::kNUM_CUDNN_FWD_ALGS, - &perf_count, - perf_results.get())); - algo[i] = (perf_results.get())[best_algo_idx].algo; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( - handle, - in_desc[i], - filter_desc[i], - conv_desc[i], - out_desc[i], - algo[i], - &tmp_size)); - - workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); - } - cudnnActivationDescriptor_t cudnn_act_desc = - act_desc.descriptor(activation); - - int oc0 = filter_dims[0][0]; - int oc1 = filter_dims[1][0] - filter_dims[2][1] * 2; - int oc3 = filter_dims[3][0]; - int oc2 = oc - oc0 - oc1 - oc3; - - // branch1: pool + 1x1 conv - ScalingParamType alpha = 1.0f, beta = 0.0f; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnPoolingForward(handle, - cudnn_pool_desc, - &alpha, - cudnn_input_desc, - input_data, - &beta, - pool_out_desc, - temp_data)); - - std::vector in_datas; - in_datas.push_back(static_cast(temp_data)); - in_datas.push_back(static_cast(input_data)); - in_datas.push_back( - static_cast(output_data + (oc0 + oc1) * h * w)); - temp_outs[1]->Resize(common::make_ddim(out_dims[2])); - T* temp2_data = - dev_ctx.Alloc(temp_outs[1], temp_outs[1]->numel() * sizeof(T)); - in_datas.push_back(static_cast(temp2_data + oc2 * h * w)); - - std::vector out_datas; - out_datas.push_back(static_cast(output_data)); - out_datas.push_back(static_cast(output_data + oc0 * h * w)); - out_datas.push_back(static_cast(temp2_data)); - out_datas.push_back( - static_cast(output_data + (oc0 + oc1 + oc2) * h * w)); - - for (int i = 0; i < 4; ++i) { - auto func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBiasActivationForward( - handle, - &alpha, - in_desc[i], - in_datas[i], - filter_desc[i], - static_cast(filters[i]->data()), - conv_desc[i], - algo[i], - cudnn_workspace, - workspace_size_in_bytes, - &beta, - out_desc[i], - out_datas[i], - bias_desc[i], - static_cast(bias[i]->data()), - cudnn_act_desc, - out_desc[i], - out_datas[i])); - }; - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - workspace_handle.RunFunc(func, workspace_size_in_bytes); - } - - cudnnTensorDescriptor_t x_desc; - cudnnTensorDescriptor_t y_desc; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&x_desc)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&y_desc)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - x_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[2].data())); - 
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - y_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[3].data())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnTransformTensor( - handle, - CudnnDataType::kOne(), - x_desc, - static_cast(out_datas[2]), - CudnnDataType::kZero(), - y_desc, - static_cast(output_data + (oc0 + oc1) * h * w))); - - for (int i = 0; i < 4; ++i) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(in_desc[i])); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(out_desc[i])); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyFilterDescriptor(filter_desc[i])); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(bias_desc[i])); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyConvolutionDescriptor(conv_desc[i])); - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(x_desc)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(y_desc)); - } -}; -#endif - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL(conv2d_inception_fusion, - GPU, - ALL_LAYOUT, - ops::CUDNNConvInceptionFusionOpKernel, - float, - double) {} diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index 1c3b37d12d689..dc3a223d745b3 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -36,7 +36,7 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { auto x_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(x_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(X)'s rank must be 2, but received x's rank " "is:%d, x dim is:[%s]", x_dims.size(), @@ -48,7 +48,7 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { auto c_dims = ctx->GetInputDim("C0"); PADDLE_ENFORCE_EQ(h_dims, c_dims, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of Input(H0) and Input(C0) should be " "same, but received h0 dims is:[%s], c0 dims is:[%s]", h_dims, @@ -58,14 +58,14 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { auto wx_dims = ctx->GetInputDim("WeightX"); PADDLE_ENFORCE_EQ(wx_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rank of Input(WeightX) should be 2, but received " "WeightX's rank is:%d, WeightX dim is:[%s]", wx_dims.size(), wx_dims)); PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dimension of Input(WeightX) " "should equal to second dimension of Input(X), but " "received WeightX first dim is:%d, X second dim is:%d", @@ -77,14 +77,14 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ(wh_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rank of Input(WeightH) should be 2, but received " "WeightH rank is:%d, WeightH dim is:[%s]", wh_dims.size(), wh_dims)); PADDLE_ENFORCE_EQ(wh_dims[0], frame_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dimension of Input(WeightH) " "should equal to frame size, but received WeightH " "first dim is:%d, frame size is:%d.", @@ -93,7 +93,7 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { 
PADDLE_ENFORCE_EQ(wh_dims[1], 4 * frame_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dimension of Input(WeightH) " "should equal to 4 * frame_size, but received WeightH " "second dimension is:%d, frame size is:%d.", @@ -103,14 +103,14 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { auto b_dims = ctx->GetInputDim("Bias"); PADDLE_ENFORCE_EQ(b_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rank of Input(Bias) should be 2, but received " "Bias rank is:%d, Bias dim is:[%s]", b_dims.size(), b_dims)); PADDLE_ENFORCE_EQ(b_dims[0], 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dimension of Input(Bias) should be 1, but " "received Bias's dimension is:[%s]", b_dims)); @@ -118,7 +118,7 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { if (ctx->Attrs().Get("use_peepholes")) { PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dimension of Input(Bias) should be " "7 * %d if enable peepholes connection, but received " "Bias dim is:[%s]", @@ -129,7 +129,7 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ( b_dims[1], 4 * frame_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dimension of Input(Bias) should be " "4 * %d if disable peepholes, but received Bias dim is:[%s]", frame_size, diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc index b489f5e458bc1..725eb2682e1a2 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc @@ -26,24 +26,24 @@ void FusionSeqPoolConcatOp::InferShape( framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Inputs(X) of FusionSeqPoolConcatOp should be greater " "than 1, but received value is %d.", ctx->Inputs("X").size())); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "FusionSeqPoolConcat"); int axis = ctx->Attrs().Get("axis"); - PADDLE_ENFORCE_EQ(axis, - 1, - platform::errors::InvalidArgument( - "FusionSeqPoolConcatOp only supports concat " - "axis=1 yet, but received axis value is %d", - axis)); + PADDLE_ENFORCE_EQ( + axis, + 1, + phi::errors::InvalidArgument("FusionSeqPoolConcatOp only supports concat " + "axis=1 yet, but received axis value is %d", + axis)); auto ins_dims = ctx->GetInputsDim("X"); const size_t n = ins_dims.size(); PADDLE_ENFORCE_GT(n, 0UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input tensors count should be greater than 0, " "but received value is %d.", n)); @@ -55,7 +55,7 @@ void FusionSeqPoolConcatOp::InferShape( // since input lod is not accessible here. 
PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dims size of first input should be equal to 2, " "but received value is %d.", ins_dims[0].size())); @@ -116,7 +116,7 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel { int w = static_cast(ins[0]->numel() / x0_dims[0]); PADDLE_ENFORCE_EQ(y_dims[1] % w, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The output of dims[1] should be dividable of w, but " "dims[1] is %d, w is %d.", y_dims[1], @@ -140,7 +140,7 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( static_cast(ins[i]->numel() / x_dims[0]), w, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Width of all inputs should be equal, but the width of the %d-th " "input %d is not equal to the previous %d", i, @@ -149,7 +149,7 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( x_lod.size(), bs + 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Batchsize of all inputs should be equal, but the value of the " "%d-th %d is not equal to the previous %d.", i, diff --git a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc index 5bcd4d2fbc75a..352d427b8ab91 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc @@ -27,33 +27,31 @@ void FusionSeqPoolCVMConcatOp::InferShape( PADDLE_ENFORCE_GE( ctx->Inputs("X").size(), 1UL, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Inputs(X) of FusionSeqPoolCVMConcatOp should not be empty.")); PADDLE_ENFORCE( ctx->HasOutput("Out"), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(Out) of FusionSeqPoolCVMConcatOp should not be null.")); int axis = ctx->Attrs().Get("axis"); - PADDLE_ENFORCE_EQ(axis, - 1, - paddle::platform::errors::InvalidArgument( - "FusionSeqPoolCVMConcatOp only supports " - "concat axis=1 yet, but received %d.", - axis)); + PADDLE_ENFORCE_EQ( + axis, + 1, + phi::errors::InvalidArgument("FusionSeqPoolCVMConcatOp only supports " + "concat axis=1 yet, but received %d.", + axis)); bool use_cvm = ctx->Attrs().Get("use_cvm"); - PADDLE_ENFORCE_EQ(use_cvm, - true, - paddle::platform::errors::InvalidArgument( - "FusionSeqPoolCVMConcatOp only supports " - "use_cvm is true yet, but received %d.", - use_cvm)); + PADDLE_ENFORCE_EQ( + use_cvm, + true, + phi::errors::InvalidArgument("FusionSeqPoolCVMConcatOp only supports " + "use_cvm is true yet, but received %d.", + use_cvm)); auto ins_dims = ctx->GetInputsDim("X"); const size_t n = ins_dims.size(); - PADDLE_ENFORCE_GT(n, - 0UL, - paddle::platform::errors::InvalidArgument( - "Input tensors count should > 0.")); + PADDLE_ENFORCE_GT( + n, 0UL, phi::errors::InvalidArgument("Input tensors count should > 0.")); if (n == 1) { LOG(WARNING) << "Only have one input, may waste memory"; } @@ -62,7 +60,7 @@ void FusionSeqPoolCVMConcatOp::InferShape( // since input lod is not accessible here. 
PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dims size of first input should be 2.")); ctx->SetOutputDim("Out", {-1, ins_dims[0][axis] * static_cast(n)}); } @@ -120,7 +118,7 @@ class FusionSeqPoolCVMConcatKernel : public framework::OpKernel { int w = static_cast(ins[0]->numel() / x0_dims[0]); PADDLE_ENFORCE_EQ(y_dims[1] % w, 0, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The output of dims[1] should be dividable of w")); phi::jit::seq_pool_attr_t attr(w, phi::jit::SeqPoolType::kSum); if (pooltype == "AVERAGE") { @@ -138,13 +136,13 @@ class FusionSeqPoolCVMConcatKernel : public framework::OpKernel { auto x_lod = ins[i]->lod()[0]; const T* src = ins[i]->data(); T* dst = y_data + i * w; - PADDLE_ENFORCE_EQ(static_cast(ins[i]->numel() / x_dims[0]), - w, - paddle::platform::errors::InvalidArgument( - "Width of all inputs should be equal.")); + PADDLE_ENFORCE_EQ( + static_cast(ins[i]->numel() / x_dims[0]), + w, + phi::errors::InvalidArgument("Width of all inputs should be equal.")); PADDLE_ENFORCE_EQ(x_lod.size(), bs + 1, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Batchsize of all inputs should be equal.")); for (size_t j = 0; j < bs; ++j) { attr.h = static_cast(x_lod[j + 1] - x_lod[j]); diff --git a/paddle/fluid/operators/fused/multi_gru_op.cc b/paddle/fluid/operators/fused/multi_gru_op.cc index df8e8c956a045..7011dfebb6719 100644 --- a/paddle/fluid/operators/fused/multi_gru_op.cc +++ b/paddle/fluid/operators/fused/multi_gru_op.cc @@ -37,10 +37,10 @@ void MultiGRUOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ( x_mat_dims.size(), 2, - platform::errors::InvalidArgument("The size of input X dims should be 2, " - "or 3 with second dimension equal to " - "1, but now Input X dim is:[%s] ", - x_dims)); + phi::errors::InvalidArgument("The size of input X dims should be 2, " + "or 3 with second dimension equal to " + "1, but now Input X dim is:[%s] ", + x_dims)); auto layers = ctx->Attrs().Get("layers"); auto wx_dims = ctx->GetInputsDim("WeightX"); @@ -48,7 +48,7 @@ void MultiGRUOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ( wx_dims[i][0], x_mat_dims[1], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dimension of flattened WeightX #%d" "should equal to last dimension of flattened input X, but " "received fattened WeightX dimension is:%d, flattened X dimension " @@ -62,7 +62,7 @@ void MultiGRUOp::InferShape(framework::InferShapeContext* ctx) const { for (int i = 0; i < 2 * layers; ++i) { PADDLE_ENFORCE_EQ(wx_dims[i].size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rank of WeightX #%d should be 2, but received " "WeightX dim size is:%d, WeightX dim is:[%s] ", i, @@ -70,7 +70,7 @@ void MultiGRUOp::InferShape(framework::InferShapeContext* ctx) const { wx_dims[i])); PADDLE_ENFORCE_EQ(wh_dims[i].size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rank of WeightH #%d should be 2, but received " "WeightH dim size is:%d, WeightH dim is:[%s] ", i, @@ -80,7 +80,7 @@ void MultiGRUOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ( wh_dims[i][1], 3 * frame_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dimension of WeightH #%d " "should equal to 3 * frame_size, but received WeightH's " "second dimension is: %d, frame size is:%d", @@ -90,7 +90,7 @@ 
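The multi_gru_op.cc hunks that follow enforce a per-layer, per-direction shape contract: WeightX is `[input_width, 3 * frame_size]`, WeightH is `[frame_size, 3 * frame_size]`, and `frame_size` itself is read off `wh_dims[i][0]`. A small sketch of those invariants under invented dimensions:

```cpp
// Shape contract checked by MultiGRUOp::InferShape, with made-up sizes.
#include <array>
#include <cstdio>
#include <stdexcept>

int main() {
  const int input_width = 16;        // x_mat_dims[1]
  std::array<int, 2> wx = {16, 24};  // wx_dims[i]: [input_width, 3*frame]
  std::array<int, 2> wh = {8, 24};   // wh_dims[i]: [frame, 3*frame]
  const int frame_size = wh[0];
  if (wx[0] != input_width)
    throw std::invalid_argument(
        "WeightX dim 0 must match the last dim of flattened X");
  if (wh[1] != 3 * frame_size || wx[1] != 3 * frame_size)
    throw std::invalid_argument("gate dimension must be 3 * frame_size");
  std::printf("frame_size=%d, gate width=%d: shapes consistent\n",
              frame_size, 3 * frame_size);
  return 0;
}
```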
void MultiGRUOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ( wx_dims[i][1], 3 * frame_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dimension of WeightX #%d " "should equal to 3 * frame_size, but received WeightX's " "second dimension is: %d, frame size is:%d", @@ -105,7 +105,7 @@ void MultiGRUOp::InferShape(framework::InferShapeContext* ctx) const { int frame_size = static_cast(wh_dims[i][0]); PADDLE_ENFORCE_EQ(b_dims[i].size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rank of Bias #%d should be 2, but received " "Bias rank is:%d, Bias dim is:[%s]", i, @@ -113,7 +113,7 @@ void MultiGRUOp::InferShape(framework::InferShapeContext* ctx) const { b_dims[i])); PADDLE_ENFORCE_EQ(b_dims[i][0], 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dimension of Bias #%d should be 1, but " "received Bias first dim is:%d, Bias dim is:[%s]", i, @@ -122,7 +122,7 @@ void MultiGRUOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ( b_dims[i][1], frame_size * 3, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of Bias #%d must be [1, frame_size * 3], but " "received bias dim is:[%s], frame size is:%d", i, diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc b/paddle/fluid/operators/fused/onednn/fusion_lstm_onednn_op.cc similarity index 95% rename from paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc rename to paddle/fluid/operators/fused/onednn/fusion_lstm_onednn_op.cc index 2b92cb6f76663..68c73f7d3500b 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/onednn/fusion_lstm_onednn_op.cc @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/operators/fused/fusion_lstm_op.h" -#include "paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h" +#include "paddle/fluid/operators/fused/onednn/fusion_rnn_onednn.h" #include "paddle/phi/core/expect.h" namespace paddle { @@ -66,17 +66,17 @@ class LSTMMKLDNNHandler PADDLE_ENFORCE_EQ( ctx.Attr("gate_activation"), "sigmoid", - platform::errors::Unimplemented("oneDNN fusion_lstm supports only " - "sigmoid as a gate activation.")); + phi::errors::Unimplemented("oneDNN fusion_lstm supports only " + "sigmoid as a gate activation.")); PADDLE_ENFORCE_EQ( ctx.Attr("cell_activation"), "tanh", - platform::errors::Unimplemented( + phi::errors::Unimplemented( "oneDNN fusion_lstm supports only tanh as a cell activation.")); PADDLE_ENFORCE_EQ( ctx.Attr("candidate_activation"), "tanh", - platform::errors::Unimplemented( + phi::errors::Unimplemented( "oneDNN fusion_lstm supports only tanh a candidate activation.")); // Weights for int8 kernel are of a type s8 @@ -325,7 +325,7 @@ template class FusionLSTMMKLDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const bool is_bf16 = std::is_same::value; + const bool is_bf16 = std::is_same::value; const bool force_fp32_output = ctx.Attr("force_fp32_output"); // BF16 does not support force output @@ -407,14 +407,11 @@ class FusionLSTMMKLDNNKernel : public framework::OpKernel { handler.template AcquireWeightHMemory(weight_h); } else if (framework::TransToProtoVarType(weight_h->dtype()) == paddle::framework::proto::VarType_Type_BF16) { - h0_memory_p = - handler.template AcquireH0Memory(h0); + h0_memory_p = handler.template AcquireH0Memory(h0); weight_x_memory_p = - handler.template AcquireWeightXMemory( - weight_x); + handler.template AcquireWeightXMemory(weight_x); weight_h_memory_p = - handler.template AcquireWeightHMemory( - weight_h); + handler.template AcquireWeightHMemory(weight_h); } else { h0_memory_p = handler.template AcquireH0Memory(h0); weight_x_memory_p = @@ -478,4 +475,4 @@ PD_REGISTER_STRUCT_KERNEL(fusion_lstm, ops::FusionLSTMMKLDNNKernel, float, uint8_t, - paddle::platform::bfloat16) {} + phi::dtype::bfloat16) {} diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h b/paddle/fluid/operators/fused/onednn/fusion_rnn_onednn.h similarity index 100% rename from paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h rename to paddle/fluid/operators/fused/onednn/fusion_rnn_onednn.h diff --git a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/onednn/multi_gru_onednn_op.cc similarity index 97% rename from paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc rename to paddle/fluid/operators/fused/onednn/multi_gru_onednn_op.cc index 8e11c91a117d1..c9545876a0dc6 100644 --- a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/onednn/multi_gru_onednn_op.cc @@ -69,29 +69,29 @@ class MultiGRUHandler { PADDLE_ENFORCE_EQ( weights_x_.size(), layers_ * 2, - platform::errors::InvalidArgument("The number of WeightX inputs does " - "not match the number of layers.")); + phi::errors::InvalidArgument("The number of WeightX inputs does " + "not match the number of layers.")); PADDLE_ENFORCE_EQ( weights_h_.size(), layers_ * 2, - platform::errors::InvalidArgument("The number of WeightH inputs does " - "not match the number of layers.")); + phi::errors::InvalidArgument("The number of WeightH inputs does " + "not match the 
number of layers.")); if (!biases_.empty()) PADDLE_ENFORCE_EQ( biases_.size(), layers_ * 2, - platform::errors::InvalidArgument("The number of Bias inputs does " - "not match the number of layers.")); + phi::errors::InvalidArgument("The number of Bias inputs does " + "not match the number of layers.")); // oneDNN kernel has hardcoded activation functions PADDLE_ENFORCE_EQ( ctx.Attr("gate_activation"), "sigmoid", - platform::errors::Unimplemented( + phi::errors::Unimplemented( "oneDNN fusion_gru supports only sigmoid as a gate activation.")); PADDLE_ENFORCE_EQ( ctx.Attr("activation"), "tanh", - platform::errors::Unimplemented( + phi::errors::Unimplemented( "oneDNN fusion_gru supports only tanh as an activation.")); N_ = x_lod_.size() - 1; // Number of sentences (batches) @@ -134,7 +134,7 @@ class MultiGRUHandler { PADDLE_ENFORCE_EQ( scale_weights.size(), layers_ * 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of weight scale inputs does " "not match the number of layers. Expected: %d. Actual: %d", layers_ * 2, @@ -212,7 +212,7 @@ class MultiGRUHandler { attrs_[2 * layer + (dir == R2L)]); PADDLE_ENFORCE_NOT_NULL( pd, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Primitive descriptor for gru_forward cannot be null.")); dev_ctx_.SetBlob(pd_key, pd); } diff --git a/paddle/fluid/operators/fused/quant_dequant_kernel.h b/paddle/fluid/operators/fused/quant_dequant_kernel.h index 8e8fdc95e91b5..63dbee42d6e7a 100644 --- a/paddle/fluid/operators/fused/quant_dequant_kernel.h +++ b/paddle/fluid/operators/fused/quant_dequant_kernel.h @@ -17,8 +17,8 @@ limitations under the License. */ #include #include "paddle/fluid/operators/fake_quantize_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/float16.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" namespace paddle { diff --git a/paddle/fluid/operators/fused/resnet_basic_block_op.cc b/paddle/fluid/operators/fused/resnet_basic_block_op.cc index 58125a9b7f674..37315367189fa 100644 --- a/paddle/fluid/operators/fused/resnet_basic_block_op.cc +++ b/paddle/fluid/operators/fused/resnet_basic_block_op.cc @@ -112,29 +112,29 @@ class ResNetBasicBlockOp : public framework::OperatorWithKernel { // make sure Mean/RunningMean and Var/RunningVar share memory PADDLE_ENFORCE_EQ(ctx->Inputs("Mean1")[0], ctx->Outputs("Mean1Out")[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Mean1 and Mean1Out should share the same memory")); PADDLE_ENFORCE_EQ(ctx->Inputs("Var1")[0], ctx->Outputs("Var1Out")[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Var1 and Var1Out should share the same memory")); PADDLE_ENFORCE_EQ(ctx->Inputs("Mean2")[0], ctx->Outputs("Mean2Out")[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Mean2 and Mean2Out should share the same memory")); PADDLE_ENFORCE_EQ(ctx->Inputs("Var2")[0], ctx->Outputs("Var2Out")[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Var2 and Var2Out should share the same memory")); if (has_shortcut) { PADDLE_ENFORCE_EQ(ctx->Inputs("Mean3")[0], ctx->Outputs("Mean3Out")[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Mean3 and Mean3Out should share the same memory")); PADDLE_ENFORCE_EQ(ctx->Inputs("Var3")[0], ctx->Outputs("Var3Out")[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Var3 and 
Var3Out should share the same memory")); } @@ -143,10 +143,10 @@ class ResNetBasicBlockOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( data_format, "NCHW", - platform::errors::InvalidArgument("The data format must equal to NCHW. " - "But received: the data format " - "= [%s]", - data_format)); + phi::errors::InvalidArgument("The data format must equal to NCHW. " + "But received: the data format " + "= [%s]", + data_format)); int stride1 = ctx->Attrs().Get("stride1"); int stride2 = ctx->Attrs().Get("stride2"); int padding1 = ctx->Attrs().Get("padding1"); @@ -158,13 +158,13 @@ class ResNetBasicBlockOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( x1_dims.size(), 4, - platform::errors::InvalidArgument("The dimensions of input " - "must equal to 4." - "But received: the shape of input " - "= [%s], the dimension of input = " - "[%d]", - x1_dims, - x1_dims.size())); + phi::errors::InvalidArgument("The dimensions of input " + "must equal to 4." + "But received: the shape of input " + "= [%s], the dimension of input = " + "[%d]", + x1_dims, + x1_dims.size())); // Calculate the dims of output1 int batch = x1_dims[0]; @@ -226,26 +226,26 @@ class ResNetBasicBlockOp : public framework::OperatorWithKernel { // By default, the type of the scale, bias, mean, // and var tensors should be float when input tensor's dtype is float16. auto bn_param_type = framework::proto::VarType::FP32; - PADDLE_ENFORCE_EQ(bn_param_type, - framework::TransToProtoVarType( - ctx.Input("Scale1")->dtype()), - platform::errors::InvalidArgument( - "Scale input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, - framework::TransToProtoVarType( - ctx.Input("Bias1")->dtype()), - platform::errors::InvalidArgument( - "Bias input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, - framework::TransToProtoVarType( - ctx.Input("Scale2")->dtype()), - platform::errors::InvalidArgument( - "Scale input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, - framework::TransToProtoVarType( - ctx.Input("Bias2")->dtype()), - platform::errors::InvalidArgument( - "Bias input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType( + ctx.Input("Scale1")->dtype()), + phi::errors::InvalidArgument("Scale input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType( + ctx.Input("Bias1")->dtype()), + phi::errors::InvalidArgument("Bias input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType( + ctx.Input("Scale2")->dtype()), + phi::errors::InvalidArgument("Scale input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType( + ctx.Input("Bias2")->dtype()), + phi::errors::InvalidArgument("Bias input should be of float type")); return phi::KernelKey(input_data_type, ctx.GetPlace()); } @@ -546,8 +546,7 @@ class ResNetBasicBlockGradOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const { PADDLE_ENFORCE_NOT_NULL( ctx.InputVar(framework::GradVarName("Y")), - platform::errors::NotFound( - "Can not find Y@GRAD in the execution context.")); + phi::errors::NotFound("Can not find Y@GRAD in the execution context.")); return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); diff --git a/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc b/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc index 16e2261f1afb5..50a3b3c46137d 100644 --- 
a/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc +++ b/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc @@ -298,10 +298,9 @@ class ResNetBasicBlockXPUKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_xpu_place(ctx.GetPlace()), - true, - platform::errors::PreconditionNotMet("It must use XPUPlace.")); + PADDLE_ENFORCE_EQ(platform::is_xpu_place(ctx.GetPlace()), + true, + phi::errors::PreconditionNotMet("It must use XPUPlace.")); // input const phi::DenseTensor* x = ctx.Input("X"); @@ -704,10 +703,9 @@ class ResNetBasicBlockGradXPUKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_xpu_place(ctx.GetPlace()), - true, - platform::errors::PreconditionNotMet("It must use XPUPlace.")); + PADDLE_ENFORCE_EQ(platform::is_xpu_place(ctx.GetPlace()), + true, + phi::errors::PreconditionNotMet("It must use XPUPlace.")); const phi::DenseTensor* y_grad = ctx.Input(framework::GradVarName("Y")); diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cc b/paddle/fluid/operators/fused/resnet_unit_op.cc index 5827cd3427dee..d4e9b3f8e4525 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op.cc +++ b/paddle/fluid/operators/fused/resnet_unit_op.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/phi/common/float16.h" namespace paddle { namespace operators { @@ -101,22 +101,22 @@ class ResNetUnitOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->Inputs("MeanX")[0], ctx->Outputs("RunningMeanX")[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "MeanX and RunningMeanX should share the same memory")); PADDLE_ENFORCE_EQ(ctx->Inputs("VarX")[0], ctx->Outputs("RunningVarX")[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "VarX and RunningVarX should share the same memory")); if (has_shortcut) { PADDLE_ENFORCE_EQ( ctx->Inputs("MeanZ")[0], ctx->Outputs("RunningMeanZ")[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "MeanZ and RunningMeanZ should share the same memory")); PADDLE_ENFORCE_EQ( ctx->Inputs("VarZ")[0], ctx->Outputs("RunningVarZ")[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "VarZ and RunningVarZ should share the same memory")); } @@ -132,25 +132,25 @@ class ResNetUnitOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( x_dims.size(), 4, - platform::errors::InvalidArgument("The dimensions of input " - "must equal to 4." - "But received: the shape of input " - "= [%s], the dimension of input = " - "[%d]", - x_dims, - x_dims.size())); - PADDLE_ENFORCE_EQ(w_dims.size(), - 4, - platform::errors::InvalidArgument( - "The dimensions of filter " - "must equal to 4." - "But received: the shape of filter " - "= [%s], the dimension of filter = [%d] ", - w_dims, - w_dims.size())); + phi::errors::InvalidArgument("The dimensions of input " + "must equal to 4." + "But received: the shape of input " + "= [%s], the dimension of input = " + "[%d]", + x_dims, + x_dims.size())); + PADDLE_ENFORCE_EQ( + w_dims.size(), + 4, + phi::errors::InvalidArgument("The dimensions of filter " + "must equal to 4." 
+ "But received: the shape of filter " + "= [%s], the dimension of filter = [%d] ", + w_dims, + w_dims.size())); PADDLE_ENFORCE_EQ(bn_param_dims.size(), 4, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimensions of bn param " "must equal to 4." "But received: the shape of bn param " @@ -208,16 +208,16 @@ class ResNetUnitOp : public framework::OperatorWithKernel { // and var tensors should be float when input tensor's dtype is float16. auto bn_param_type = framework::proto::VarType::FP32; - PADDLE_ENFORCE_EQ(bn_param_type, - framework::TransToProtoVarType( - ctx.Input("ScaleX")->dtype()), - platform::errors::InvalidArgument( - "Scale input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, - framework::TransToProtoVarType( - ctx.Input("BiasX")->dtype()), - platform::errors::InvalidArgument( - "Bias input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType( + ctx.Input("ScaleX")->dtype()), + phi::errors::InvalidArgument("Scale input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType( + ctx.Input("BiasX")->dtype()), + phi::errors::InvalidArgument("Bias input should be of float type")); return phi::KernelKey(input_data_type, ctx.GetPlace()); } }; @@ -394,8 +394,7 @@ class ResNetUnitGradOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE_NOT_NULL( ctx.InputVar(framework::GradVarName("Y")), - platform::errors::NotFound( - "Can not find Y@GRAD in the execution context.")); + phi::errors::NotFound("Can not find Y@GRAD in the execution context.")); return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cu b/paddle/fluid/operators/fused/resnet_unit_op.cu index 5b126008bf654..6afe03a67ceab 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op.cu +++ b/paddle/fluid/operators/fused/resnet_unit_op.cu @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h" #include "paddle/fluid/operators/fused/cudnn_norm_conv.cu.h" #include "paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/phi/common/float16.h" namespace paddle { namespace operators { @@ -30,10 +30,10 @@ class ResNetUnitKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("It must use CUDAPlace.")); + phi::errors::PreconditionNotMet("It must use CUDAPlace.")); PADDLE_ENFORCE_EQ(platform::CudnnDataType::type, CUDNN_DATA_HALF, - platform::errors::Unavailable( + phi::errors::Unavailable( "ResNetUnitOp only supports float16 for now.")); // input x @@ -230,10 +230,10 @@ class ResNetUnitGradKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("It must use CUDAPlace.")); + phi::errors::PreconditionNotMet("It must use CUDAPlace.")); PADDLE_ENFORCE_EQ(platform::CudnnDataType::type, CUDNN_DATA_HALF, - platform::errors::Unavailable( + phi::errors::Unavailable( "ResNetUnitOp only supports float16 for now.")); const phi::DenseTensor *y_grad = @@ -420,10 +420,10 @@ class ResNetUnitGradKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; PD_REGISTER_STRUCT_KERNEL( - resnet_unit, GPU, ALL_LAYOUT, ops::ResNetUnitKernel, plat::float16) {} + resnet_unit, GPU, ALL_LAYOUT, ops::ResNetUnitKernel, phi::dtype::float16) {} PD_REGISTER_STRUCT_KERNEL(resnet_unit_grad, GPU, ALL_LAYOUT, ops::ResNetUnitGradKernel, - plat::float16) {} + phi::dtype::float16) {} #endif diff --git a/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc b/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc index c00e58f8463ab..f50d452d6c285 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc +++ b/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/device_wrapper.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/phi/common/float16.h" namespace paddle { namespace operators { @@ -26,10 +26,9 @@ class ResNetUnitXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ( - platform::is_xpu_place(place), - true, - platform::errors::PreconditionNotMet("It must use XPUPlace.")); + PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), + true, + phi::errors::PreconditionNotMet("It must use XPUPlace.")); bool is_nchw = (ctx.Attr("data_format") == "NCHW"); // input x @@ -188,10 +187,9 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ( - platform::is_xpu_place(place), - true, - platform::errors::PreconditionNotMet("It must use XPUPlace.")); + PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), + true, + phi::errors::PreconditionNotMet("It must use XPUPlace.")); bool is_nchw = (ctx.Attr("data_format") == "NCHW"); const phi::DenseTensor *y_grad = @@ -365,11 +363,11 @@ PD_REGISTER_STRUCT_KERNEL(resnet_unit, XPU, ALL_LAYOUT, ops::ResNetUnitXPUKernel, - plat::float16, + phi::dtype::float16, float) {} PD_REGISTER_STRUCT_KERNEL(resnet_unit_grad, XPU, ALL_LAYOUT, ops::ResNetUnitGradXPUKernel, - plat::float16, + phi::dtype::float16, float) {} diff --git a/paddle/fluid/operators/fused/unity_build_rule.cmake b/paddle/fluid/operators/fused/unity_build_rule.cmake index 8605cd3cdae85..9ef1e53891d52 100644 --- a/paddle/fluid/operators/fused/unity_build_rule.cmake +++ b/paddle/fluid/operators/fused/unity_build_rule.cmake @@ -10,11 +10,7 @@ register_unity_group( fused_embedding_fc_lstm_op.cc fused_embedding_seq_pool_op.cc fusion_lstm_op.cc - fusion_repeated_fc_relu_op.cc - fusion_seqconv_eltadd_relu_op.cc - fusion_seqexpand_concat_fc_op.cc fusion_seqpool_concat_op.cc - fusion_squared_mat_sub_op.cc multi_gru_op.cc - mkldnn/multi_gru_mkldnn_op.cc + onednn/multi_gru_onednn_op.cc fusion_seqpool_cvm_concat_op.cc) diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc index 12c8ec9b81db1..851c448865363 100644 --- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc +++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc @@ -30,9 +30,9 @@ class SoftmaxMaskFuseUpperTriangleOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( x_dims.size(), 4, - platform::errors::InvalidArgument("Input x must be in 4D dimension but " - "received the dimension of X is %d", - x_dims.size())); + phi::errors::InvalidArgument("Input x must be in 4D dimension but " + "received the dimension of X is %d", + x_dims.size())); ctx->SetOutputDim("Out", x_dims); ctx->ShareLoD("X", "Out"); diff --git a/paddle/fluid/operators/fused_token_prune_op.cc b/paddle/fluid/operators/fused_token_prune_op.cc index 9fab5c8e7c48d..144e91be396de 100644 --- a/paddle/fluid/operators/fused_token_prune_op.cc +++ b/paddle/fluid/operators/fused_token_prune_op.cc @@ -107,59 +107,59 @@ class FusedTokenPruneOp : public framework::OperatorWithKernel { auto new_mask_dim = ctx->GetInputDim("NewMask"); // check input dims number - PADDLE_ENFORCE_EQ(mask_dim.size(), - 4, - platform::errors::InvalidArgument( - "The input mask must be 4-dimension")); - 
PADDLE_ENFORCE_EQ(attn_dim.size(), - 4, - platform::errors::InvalidArgument( - "The input attn must be 4-dimension")); + PADDLE_ENFORCE_EQ( + mask_dim.size(), + 4, + phi::errors::InvalidArgument("The input mask must be 4-dimension")); + PADDLE_ENFORCE_EQ( + attn_dim.size(), + 4, + phi::errors::InvalidArgument("The input attn must be 4-dimension")); PADDLE_ENFORCE_EQ( x_dim.size(), 3, - platform::errors::InvalidArgument("The input x must be 4-dimension")); - PADDLE_ENFORCE_EQ(new_mask_dim.size(), - 4, - platform::errors::InvalidArgument( - "The input attn must be 4-dimension")); + phi::errors::InvalidArgument("The input x must be 4-dimension")); + PADDLE_ENFORCE_EQ( + new_mask_dim.size(), + 4, + phi::errors::InvalidArgument("The input attn must be 4-dimension")); // check input dims relations PADDLE_ENFORCE_EQ(mask_dim[0], attn_dim[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dim of mask and attn should be the same" "which is batch size")); PADDLE_ENFORCE_EQ(mask_dim[1], attn_dim[1], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dim of mask and attn should be the same" "which is nb_head")); PADDLE_ENFORCE_EQ(mask_dim[0], x_dim[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dim of mask and x should be the same" "which is batch size")); PADDLE_ENFORCE_EQ( mask_dim[2], mask_dim[3], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The third dim and the fourth dim of mask should be the same" "which is max seq len")); PADDLE_ENFORCE_EQ( attn_dim[2], attn_dim[3], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The third dim and the fourth dim of mask should be the same" "which is max seq len")); PADDLE_ENFORCE_EQ(attn_dim[2], mask_dim[2], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The third dim of mask and attn should be the same" "which is max seq len")); PADDLE_ENFORCE_EQ(attn_dim[2], x_dim[1], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The third dim of mask and the second dim of attn" "should be the same which is max seq len")); diff --git a/paddle/fluid/operators/generator/get_expected_kernel_func.cc b/paddle/fluid/operators/generator/get_expected_kernel_func.cc index a0f4f7a7e22fa..d9276c5eb9d62 100644 --- a/paddle/fluid/operators/generator/get_expected_kernel_func.cc +++ b/paddle/fluid/operators/generator/get_expected_kernel_func.cc @@ -105,8 +105,8 @@ phi::KernelKey GetConcatExpectedKernelType( op_ptr->SetDnnFallback(true); } if (flag == 0) { - PADDLE_THROW(platform::errors::InvalidArgument( - "All Inputs of Concat OP are Empty!")); + PADDLE_THROW( + phi::errors::InvalidArgument("All Inputs of Concat OP are Empty!")); } return phi::KernelKey(input_data_type, ctx.GetPlace()); } @@ -128,7 +128,7 @@ phi::KernelKey GetReduceExpectedKernelType( platform::is_xpu_place(ctx.GetPlace()) || platform::is_custom_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "float16 can only be used on GPU or NPU or XPU place")); } return phi::KernelKey(input_data_type, ctx.GetPlace()); @@ -236,7 +236,7 @@ phi::KernelKey GetSoftmaxExpectedKernelType( platform::is_xpu_place(ctx.GetPlace()) || platform::is_custom_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "float16 can only be used on GPU/XPU and custom place")); } return phi::KernelKey( @@ -255,7 +255,7 @@ phi::KernelKey GetSoftmaxGradExpectedKernelType( if 
(!(platform::is_gpu_place(ctx.GetPlace()) || platform::is_xpu_place(ctx.GetPlace()) || platform::is_custom_place(ctx.GetPlace()))) - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "float16 can only be used on GPU/XPU and custom place")); } return phi::KernelKey( @@ -275,7 +275,7 @@ phi::KernelKey GetStridedSliceExpectedKernelType( platform::is_same_place(tensor.place(), ctx.device_context().GetPlace()), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Place of context is %s. Place of input tensor is %s. They " "are should be same, but reveived different place.", string::to_string(ctx.device_context().GetPlace()), @@ -375,18 +375,18 @@ phi::KernelKey GetInstanceNormExpectedKernelType( in_param_type = framework::proto::VarType::FP64; } if (ctx.HasInput("Scale")) { - PADDLE_ENFORCE_EQ(in_param_type, - framework::TransToProtoVarType( - ctx.Input("Scale")->dtype()), - platform::errors::InvalidArgument( - "Scale input should be of float type")); + PADDLE_ENFORCE_EQ( + in_param_type, + framework::TransToProtoVarType( + ctx.Input("Scale")->dtype()), + phi::errors::InvalidArgument("Scale input should be of float type")); } if (ctx.HasInput("Bias")) { - PADDLE_ENFORCE_EQ(in_param_type, - framework::TransToProtoVarType( - ctx.Input("Bias")->dtype()), - platform::errors::InvalidArgument( - "Bias input should be of float type")); + PADDLE_ENFORCE_EQ( + in_param_type, + framework::TransToProtoVarType( + ctx.Input("Bias")->dtype()), + phi::errors::InvalidArgument("Bias input should be of float type")); } return phi::KernelKey(input_data_type, ctx.GetPlace()); @@ -423,7 +423,7 @@ phi::KernelKey GetConvExpectedKernelType( PADDLE_ENFORCE_EQ( input_data_type, filter_data_type, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "input and filter data type should be consistent, " "but received input data type is %s and filter type " "is %s", diff --git a/paddle/fluid/operators/generator/templates/operator_utils.c.j2 b/paddle/fluid/operators/generator/templates/operator_utils.c.j2 index 068704e6d0687..c42032a45cdcd 100644 --- a/paddle/fluid/operators/generator/templates/operator_utils.c.j2 +++ b/paddle/fluid/operators/generator/templates/operator_utils.c.j2 @@ -811,7 +811,7 @@ class {{op_name | to_composite_grad_opmaker_name}} : public prim::CompositeGradO {% if "tensor_name" in attr_dict[attrs[i]] %} auto {{'tensor_' + attrs[i]}} = this->GetOptionalSingleForwardInput("{{attr_dict[attrs[i]]['tensor_name']}}"); if ({{'tensor_' + attrs[i]}}) { - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(phi::errors::Unimplemented( "We don't support dynamic tensor attribute {{attr_dict[attrs[i]]['tensor_name']}} for {{op_name}} composite" "for now. ")); } @@ -819,7 +819,7 @@ class {{op_name | to_composite_grad_opmaker_name}} : public prim::CompositeGradO {% if "tensors_name" in attr_dict[attrs[i]] %} auto {{'tensors_' + attrs[i]}} = this->GetOptionalMultiForwardInput("{{attr_dict[attrs[i]]['tensors_name']}}"); if ({{'tensors_' + attrs[i]}}) { - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(phi::errors::Unimplemented( "We don't support dynamic tensors attribute {{attr_dict[attrs[i]]['tensor_name']}} for {{op_name}} composite " "for now. 
")); } diff --git a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc index 8ae92b04b7df4..d11b445f3a9b8 100644 --- a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc +++ b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc @@ -30,7 +30,7 @@ class GetTensorFromSelectedRowsOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("X").front(), framework::proto::VarType::SELECTED_ROWS, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input X(%s)'s type should be SelectedRows, " "but the received is %s", ctx->Inputs("X").front(), @@ -38,7 +38,7 @@ class GetTensorFromSelectedRowsOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->GetOutputsVarType("Out").front(), framework::proto::VarType::LOD_TENSOR, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The output Out(%s)'s type should be phi::DenseTensor, " "but the received is %s", ctx->Outputs("Out").front(), diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc index c88d36602bd79..6fdd6d380a7fe 100644 --- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc +++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc @@ -38,7 +38,7 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "It must use CUDAPlace when using CUDA Kernel")); auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); @@ -87,7 +87,7 @@ class CUDNNGridSampleGradOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "It must use CUDAPlace when using CUDA Kernel")); auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc index e23b3c6c42d5f..c948315189a15 100644 --- a/paddle/fluid/operators/gru_op.cc +++ b/paddle/fluid/operators/gru_op.cc @@ -52,7 +52,7 @@ class GRUOp : public framework::OperatorWithKernel { if (ctx->IsRuntime()) { PADDLE_ENFORCE_EQ(input_size, frame_size * 3, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dimension of Input(Input) must be 3 " "times of frame_size in GRUOp, but received %d " "(Input) vs %d (frame_size).", @@ -62,7 +62,7 @@ class GRUOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( weight_dims[1], frame_size * 3, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of Input(Weight) matrix must be [frame_size, frame_size " "* 3], but received [%d, %d] (Weight) vs [%d, %d] (frame_size).", weight_dims[0], @@ -74,7 +74,7 @@ class GRUOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( h0_dims[1], frame_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The width of Input(H0) must be equal to frame_size, but " "received %d (width of H0) vs %d (frame_size).", h0_dims[1], @@ -87,7 +87,7 @@ class GRUOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( bias_height, 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of Bias must be [1, 
frame_size * 3], but received " "[%d, %d] (Bias) vs [1, %d] (frame_size * 3).", bias_height, @@ -96,7 +96,7 @@ class GRUOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( bias_width, frame_size * 3, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of Bias must be [1, frame_size * 3], but received " "[%d, %d] (Bias) vs [1, %d] (frame_size * 3).", bias_height, @@ -233,7 +233,7 @@ class GRUGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( input_size, frame_size * 3, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dimension of Input(Input) must be 3 times of " "frame_size in GRUOp, but received %d (Input) vs %d (frame_size).", input_size, @@ -241,7 +241,7 @@ class GRUGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( weight_height, frame_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of Input(Weight) matrix must be [frame_size, frame_size " "* 3], but received [%d, %d] (Weight) vs [%d, %d] (frame_size).", weight_height, @@ -251,7 +251,7 @@ class GRUGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( weight_width, frame_size * 3, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of Input(Weight) matrix must be [frame_size, frame_size " "* 3], but received [%d, %d] (Weight) vs [%d, %d] (frame_size).", weight_height, @@ -263,7 +263,7 @@ class GRUGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( h0_dims[1], frame_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The width of Input(H0) must be equal to frame_size, but " "received %d (width of H0) vs %d (frame_size).", h0_dims[1], @@ -279,7 +279,7 @@ class GRUGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( bias_height, 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of Bias must be [1, frame_size * 3], but received " "[%d, %d] (Bias) vs [1, %d] (frame_size * 3).", bias_height, @@ -288,7 +288,7 @@ class GRUGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( bias_width, frame_size * 3, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of Bias must be [1, frame_size * 3], but received " "[%d, %d] (Bias) vs [1, %d] (frame_size * 3).", bias_height, @@ -406,7 +406,7 @@ class GRUCPUKernel : public framework::OpKernel { frame_size /*height of height*/); PADDLE_ENFORCE_NOT_NULL( packed_gate, - platform::errors::NotFound( + phi::errors::NotFound( "The calculation result of packed_gate by " "GEMM_ALLOC should not be null when using MKL.")); blas.GEMM_PACK(CblasBMatrix, @@ -424,7 +424,7 @@ class GRUCPUKernel : public framework::OpKernel { frame_size /*height of height*/); PADDLE_ENFORCE_NOT_NULL( packed_state, - platform::errors::NotFound( + phi::errors::NotFound( "The calculation result of packed_state by " "GEMM_ALLOC should not be null when using MKL.")); blas.GEMM_PACK(CblasBMatrix, diff --git a/paddle/fluid/operators/gru_unit_op.cc b/paddle/fluid/operators/gru_unit_op.cc index b217d58e6d5da..5a29abda1f369 100644 --- a/paddle/fluid/operators/gru_unit_op.cc +++ b/paddle/fluid/operators/gru_unit_op.cc @@ -45,7 +45,7 @@ class GRUUnitOp : public framework::OperatorWithKernel { if (ctx->IsRuntime() || input_size >= 0) { PADDLE_ENFORCE_EQ(input_size, frame_size * 3, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dimension of Input(Input) must be 3 " "times of frame_size in GRUUnitOp, but received %d " 
"(Input) vs %d (frame_size).", @@ -55,7 +55,7 @@ class GRUUnitOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( weight_height, frame_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of Input(Weight) matrix must be [frame_size, frame_size " "* 3] in GRUUnitOp, but received [%d, %d] (Weight) vs [%d, %d] " "(frame_size).", @@ -66,7 +66,7 @@ class GRUUnitOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( weight_width, frame_size * 3, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of Input(Weight) matrix must be [frame_size, frame_size " "* 3] in GRUUnitOp, but received [%d, %d] (Weight) vs [%d, %d] " "(frame_size).", @@ -82,7 +82,7 @@ class GRUUnitOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( bias_height, 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of Bias must be [1, frame_size * 3], but received " "[%d, %d] (Bias) vs [1, %d] (frame_size * 3).", bias_height, @@ -91,7 +91,7 @@ class GRUUnitOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( bias_width, frame_size * 3, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of Bias must be [1, frame_size * 3], but received " "[%d, %d] (Bias) vs [1, %d] (frame_size * 3).", bias_height, @@ -203,7 +203,7 @@ class GRUUnitGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( input_size, frame_size * 3, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dimension of Input(Input) must be 3 " "times of frame_size in GRUUnitGradOp, but received %d " "(Input) vs %d (frame_size).", @@ -213,7 +213,7 @@ class GRUUnitGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( weight_height, frame_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of Input(Weight) matrix must be [frame_size, frame_size " "* 3] in GRUUnitGradOp, but received [%d, %d] (Weight) vs [%d, %d] " "(frame_size).", @@ -224,7 +224,7 @@ class GRUUnitGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( weight_width, frame_size * 3, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of Input(Weight) matrix must be [frame_size, frame_size " "* 3] in GRUUnitGradOp, but received [%d, %d] (Weight) vs [%d, %d] " "(frame_size).", @@ -240,7 +240,7 @@ class GRUUnitGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( bias_height, 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of Bias must be [1, frame_size * 3], but received " "[%d, %d] (Bias) vs [1, %d] (frame_size * 3).", bias_height, @@ -249,7 +249,7 @@ class GRUUnitGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( bias_width, frame_size * 3, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of Bias must be [1, frame_size * 3], but received " "[%d, %d] (Bias) vs [1, %d] (frame_size * 3).", bias_height, diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h index 933176433e2d7..fa774e2bef3c2 100644 --- a/paddle/fluid/operators/gru_unit_op.h +++ b/paddle/fluid/operators/gru_unit_op.h @@ -46,7 +46,7 @@ class GRUUnitKernel : public framework::OpKernel { else ReluCUDAFunctor()(d, x, y); } else { - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(phi::errors::Unimplemented( "Unsupported activation type, only supports identity, sigmoid, tanh " "and relu.")); } @@ -169,7 +169,7 @@ class GRUUnitGradKernel 
: public framework::OpKernel { else if (act_type == relu) ReluGradFunctor()(d, x, y, dy, dx); else - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(phi::errors::Unimplemented( "Unsupported activation type, only supports identity, sigmoid, tanh " "and relu.")); } diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc index 03887561934b7..002a98f3538e4 100644 --- a/paddle/fluid/operators/hash_op.cc +++ b/paddle/fluid/operators/hash_op.cc @@ -46,7 +46,7 @@ class HashOp : public framework::OperatorWithKernel { auto dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(dims.size(), 2UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input of hash_op's dimensions must be 2")); std::vector out_dims; int num_hash = ctx->Attrs().Get("num_hash"); diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index dea3ce3fe695b..dcf16cf104cc8 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -35,7 +35,7 @@ class HingeLossOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( pred_dims, label_dims, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(input) and Input(label) should have the same " "shape, but received input shape [%s] != label shape [%s]", pred_dims, @@ -44,13 +44,13 @@ class HingeLossOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( pred_dims.size(), 2, - platform::errors::InvalidArgument("Input(input) rank should be 2, " - "but received input rank(%d) != 2", - pred_dims.size())); + phi::errors::InvalidArgument("Input(input) rank should be 2, " + "but received input rank(%d) != 2", + pred_dims.size())); PADDLE_ENFORCE_EQ(pred_dims[1], 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dimension of Input(input) should be 1, " "as each row of input contains a real value, " "but received second dimension of input (%d) != 1", @@ -112,7 +112,7 @@ class HingeLossGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of loss gradient should be the same as " "the shape of Input(input), but received the loss " "gradient shape [%s] != input shape [%s]", diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index 0486dd12c4519..d11734c1a6c99 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -27,19 +27,19 @@ class Im2SequenceOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), - true, - platform::errors::NotFound( - "The input 'X' of Im2SequenceOp is not found.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("X"), + true, + phi::errors::NotFound("The input 'X' of Im2SequenceOp is not found.")); PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::NotFound( + phi::errors::NotFound( "The output 'Out' of Im2SequenceOp is not found.")); auto in_dim = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(in_dim.size(), 4, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimensions size of input 'X' in Im2SequenceOp " "should be 4. 
But " "received dimensions size=[%d], dimensions=[%s].", @@ -159,13 +159,13 @@ class Im2SequenceGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::NotFound( + phi::errors::NotFound( "The input 'X' of Im2SequenceGradOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), - true, - platform::errors::NotFound( - "The input %s of Im2SequenceGradOp is not found.", - framework::GradVarName("Out"))); + PADDLE_ENFORCE_EQ( + ctx->HasInput(framework::GradVarName("Out")), + true, + phi::errors::NotFound("The input %s of Im2SequenceGradOp is not found.", + framework::GradVarName("Out"))); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } }; diff --git a/paddle/fluid/operators/index_impl.cu.h b/paddle/fluid/operators/index_impl.cu.h index 629717f61933a..364eb9d574036 100644 --- a/paddle/fluid/operators/index_impl.cu.h +++ b/paddle/fluid/operators/index_impl.cu.h @@ -89,7 +89,7 @@ void IndexKernel(const KPDevice &dev_ctx, Tensor *out, Functor func) { <<>>(out_data, numel, main_offset, func); break; default: { - PADDLE_THROW(paddle::platform::errors::Unimplemented( + PADDLE_THROW(phi::errors::Unimplemented( "Unsupported vectorized size: %d !", vec_size)); break; } diff --git a/paddle/fluid/operators/index_select_op.h b/paddle/fluid/operators/index_select_op.h index c06885633f348..8ac3fee1d0452 100644 --- a/paddle/fluid/operators/index_select_op.h +++ b/paddle/fluid/operators/index_select_op.h @@ -59,7 +59,7 @@ void IndexSelectInner(const framework::ExecutionContext& context, PADDLE_ENFORCE_GE( index_data[i], 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Variable value (index) of OP(index_select) " "expected >= 0 and < %ld, but got %ld. Please check input " "value.", @@ -68,7 +68,7 @@ void IndexSelectInner(const framework::ExecutionContext& context, PADDLE_ENFORCE_LT( index_data[i], input_dim[dim], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Variable value (index) of OP(index_select) " "expected >= 0 and < %ld, but got %ld. Please check input " "value.", diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 4d7730d687d8d..21cd6ad3e084a 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -17,7 +17,7 @@ #include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_DNNL -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/onednn_helper.h" #endif namespace paddle { @@ -31,7 +31,7 @@ static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) { PADDLE_ENFORCE_EQ("linear", interp_method, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Interpolation method can only be \"linear\" when" "Input(X) dimension is 3, but got method = %s .", interp_method)); @@ -44,7 +44,7 @@ static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) { PADDLE_ENFORCE_EQ( inputs_name.size(), 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(SizeTensor)'size of Op(interpolate) must be 1. 
" "Attr(out_shape)'s length must be 1 for 3-D input tensor, but got " "size = %d .", @@ -67,7 +67,7 @@ static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) { PADDLE_ENFORCE_EQ( scale_tensor.size(), 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Scale's dimension size must be 1, but got dimension = %d .", scale_tensor.size())); out_w = -1; @@ -90,13 +90,13 @@ static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) { PADDLE_ENFORCE_EQ( out_size_dim.size(), 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "OutSize's dimension size must be 1, but got dimension = %d .", out_size_dim.size())); PADDLE_ENFORCE_EQ( out_size_dim[0], 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "OutSize's 0-th dimension's value must be 1, but got value = %d .", out_size_dim[0])); ctx->ShareLoD("X", "Out"); @@ -119,7 +119,7 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) { PADDLE_ENFORCE_EQ("bilinear" == interp_method || "nearest" == interp_method || "bicubic" == interp_method, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Interpolation method can only be \"bilinear\" " "or \"nearest\" or \"bicubic\" when " "Input(X) dimension is 4, but got method is %s.", @@ -133,7 +133,7 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) { PADDLE_ENFORCE_EQ( inputs_name.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(SizeTensor)'size of Op(interpolate) must be 2. " "Attr(out_shape)'s length must be 2 for 4-D input " "tensor, but got size = %d .", @@ -157,7 +157,7 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) { PADDLE_ENFORCE_EQ( scale_tensor.size(), 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Scale's dimension size must be 1, but got dimension = %d .", scale_tensor.size())); out_h = -1; @@ -186,13 +186,13 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) { PADDLE_ENFORCE_EQ( out_size_dim.size(), 1, - platform::errors::InvalidArgument("OutSize's dimension size must be 1, " - "but got dimension size is %d .", - out_size_dim.size())); + phi::errors::InvalidArgument("OutSize's dimension size must be 1, " + "but got dimension size is %d .", + out_size_dim.size())); PADDLE_ENFORCE_EQ( out_size_dim[0], 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "OutSize's dimension[0] must be 2, but got dimension[0] is %d .", out_size_dim[0])); ctx->ShareLoD("X", "Out"); @@ -215,7 +215,7 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) { PADDLE_ENFORCE_EQ( "trilinear", interp_method, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Interpolation method can only be \"trilinear\" when Input(X) " "dimension is 5, but got method = %s .", interp_method)); @@ -228,7 +228,7 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) { PADDLE_ENFORCE_EQ( inputs_name.size(), 3, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(SizeTensor)'s size of Op(interpolate) must be 3. 
" "Attr(out_shape)'s length must be 3 for 5-D input " "tensor, but got size = %d .", @@ -253,7 +253,7 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) { PADDLE_ENFORCE_EQ( scale_tensor.size(), 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Scale's dimension size must be 1, but got size = %d .", scale_tensor.size())); out_d = -1; @@ -288,12 +288,12 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) { PADDLE_ENFORCE_EQ( out_size_dim.size(), 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "OutSize's dimension size must be 1, but got size is %d.", out_size_dim.size())); PADDLE_ENFORCE_EQ(out_size_dim[0], 3, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "OutSize's dim[0] must be 3, but got size is %d.", out_size_dim[0])); ctx->ShareLoD("X", "Out"); @@ -321,7 +321,7 @@ class InterpolateOp : public framework::OperatorWithKernel { auto dim_x = ctx->GetInputDim("X"); // NCHW format PADDLE_ENFORCE( dim_x.size() == 3 || dim_x.size() == 4 || dim_x.size() == 5, - platform::errors::Unimplemented( + phi::errors::Unimplemented( "Input(X) dimension must be 3, 4 or 5, but got dimension = %d .", dim_x.size())); if (dim_x.size() == 3) { diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index bfbb15b076448..8a71b6d96a055 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ b/paddle/fluid/operators/interpolate_op.cu @@ -953,7 +953,7 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx, } PADDLE_ENFORCE_GT(out_w, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "out_w in Attr(out_shape) of Op(interpolate) " "should be greater than 0.")); framework::DDim dim_out; @@ -1049,12 +1049,12 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, } PADDLE_ENFORCE_GT(out_h, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "out_h in Attr(out_shape) of Op(interpolate) " "should be greater than 0.")); PADDLE_ENFORCE_GT(out_w, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "out_w in Attr(out_shape) of Op(interpolate) " "should be greater than 0.")); @@ -1205,17 +1205,17 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, } PADDLE_ENFORCE_GT(out_d, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "out_d in Attr(out_shape) of Op(interpolate) " "should be greater than 0.")); PADDLE_ENFORCE_GT(out_h, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "out_h in Attr(out_shape) of Op(interpolate) " "should be greater than 0.")); PADDLE_ENFORCE_GT(out_w, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "out_w in Attr(out_shape) of Op(interpolate) " "should be greater than 0.")); @@ -1648,7 +1648,7 @@ class InterpolateOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::NotFound("This kernel only runs on GPU device.")); + phi::errors::NotFound("This kernel only runs on GPU device.")); auto* input = ctx.Input("X"); auto* output = ctx.Output("Out"); @@ -1670,7 +1670,7 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::NotFound("This kernel only runs on GPU device.")); + phi::errors::NotFound("This kernel only runs on GPU device.")); auto* input_grad = 
ctx.Output(framework::GradVarName("X")); auto* output_grad = diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h index 563879e301d12..793b5fa629ee1 100644 --- a/paddle/fluid/operators/interpolate_op.h +++ b/paddle/fluid/operators/interpolate_op.h @@ -37,7 +37,7 @@ inline std::vector get_new_shape( auto tensor = list_new_shape_tensor[i]; PADDLE_ENFORCE_EQ(tensor->dims(), common::make_ddim({1}), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of dimension tensor should be [1]," "but received d%.", tensor->dims())); @@ -890,7 +890,7 @@ static void Interpolate1DCPUFwd(const framework::ExecutionContext& ctx, } PADDLE_ENFORCE_GT(out_w, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "out_w in Attr(out_shape) of Op(interpolate) " "should be greater than 0.")); framework::DDim dim_out; @@ -969,12 +969,12 @@ static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx, } PADDLE_ENFORCE_GT(out_h, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "out_h in Attr(out_shape) of Op(interpolate) " "should be greater than 0.")); PADDLE_ENFORCE_GT(out_w, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "out_w in Attr(out_shape) of Op(interpolate) " "should be greater than 0.")); framework::DDim dim_out; @@ -1090,17 +1090,17 @@ static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx, } PADDLE_ENFORCE_GT(out_d, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "out_d in Attr(out_shape) of Op(interpolate) " "should be greater than 0.")); PADDLE_ENFORCE_GT(out_h, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "out_h in Attr(out_shape) of Op(interpolate) " "should be greater than 0.")); PADDLE_ENFORCE_GT(out_w, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "out_w in Attr(out_shape) of Op(interpolate) " "should be greater than 0.")); diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc index 710cdaeb707b6..39a2b31fa6925 100644 --- a/paddle/fluid/operators/isfinite_op.cc +++ b/paddle/fluid/operators/isfinite_op.cc @@ -61,7 +61,7 @@ class OverflowOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( true, false, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input type mismatch, the type of Input(X) must be Tensor or " "SelectedRows, please check your input.")); } diff --git a/paddle/fluid/operators/isfinite_op.cu b/paddle/fluid/operators/isfinite_op.cu index 300229cbeca66..71aaa66a5ad0d 100755 --- a/paddle/fluid/operators/isfinite_op.cu +++ b/paddle/fluid/operators/isfinite_op.cu @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
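Nearly every hunk in this section applies the same two mechanical rewrites: the error factories move from the paddle::platform namespace to phi (platform::errors::* -> phi::errors::*), and the fp16/bf16 scalar types move from paddle/fluid/platform headers to paddle/phi/common. A minimal before/after sketch of the pattern, assembled from the surrounding hunks (a fragment, not compilable on its own, since PADDLE_ENFORCE_GT and out_w come from the enclosing operator code):

// Before: error factory under paddle::platform, fp16 header under fluid.
#include "paddle/fluid/platform/float16.h"
PADDLE_ENFORCE_GT(out_w,
                  0,
                  platform::errors::InvalidArgument(
                      "out_w in Attr(out_shape) of Op(interpolate) "
                      "should be greater than 0."));

// After: the same check routed through the phi core library.
#include "paddle/phi/common/float16.h"
PADDLE_ENFORCE_GT(out_w,
                  0,
                  phi::errors::InvalidArgument(
                      "out_w in Attr(out_shape) of Op(interpolate) "
                      "should be greater than 0."));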
#include "paddle/fluid/operators/isfinite_op.h" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" namespace ops = paddle::operators; namespace plat = paddle::platform; @@ -23,7 +23,9 @@ REGISTER_OP_CUDA_KERNEL( ops::OverflowKernel, ops::OverflowKernel, ops::OverflowKernel, - ops::OverflowKernel, + ops::OverflowKernel, ops::OverflowKernel); REGISTER_OP_CUDA_KERNEL( @@ -31,5 +33,5 @@ REGISTER_OP_CUDA_KERNEL( ops::OverflowKernel, ops::OverflowKernel, ops::OverflowKernel, - ops::OverflowKernel, + ops::OverflowKernel, ops::OverflowKernel); diff --git a/paddle/fluid/operators/isfinite_op.h b/paddle/fluid/operators/isfinite_op.h index 5352ccc99df92..0eb6243a31873 100644 --- a/paddle/fluid/operators/isfinite_op.h +++ b/paddle/fluid/operators/isfinite_op.h @@ -20,7 +20,7 @@ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/common/transform.h" #include "paddle/phi/kernels/isfinite_kernel.h" #include "paddle/phi/kernels/reduce_all_kernel.h" @@ -89,7 +89,7 @@ inline void TensorContainsNAN(const phi::DenseTensor& tensor, return; } #endif - PADDLE_THROW(platform::errors::Unimplemented("Not supported on %s.", place)); + PADDLE_THROW(phi::errors::Unimplemented("Not supported on %s.", place)); } inline void TensorContainsInf(const phi::DenseTensor& tensor, phi::DenseTensor* out) { @@ -106,7 +106,7 @@ inline void TensorContainsInf(const phi::DenseTensor& tensor, return; } #endif - PADDLE_THROW(platform::errors::Unimplemented("Not supported on %s.", place)); + PADDLE_THROW(phi::errors::Unimplemented("Not supported on %s.", place)); } inline void TensorIsfinite(const phi::DenseTensor& tensor, phi::DenseTensor* out) { @@ -123,7 +123,7 @@ inline void TensorIsfinite(const phi::DenseTensor& tensor, return; } #endif - PADDLE_THROW(platform::errors::Unimplemented("Not supported on %s.", place)); + PADDLE_THROW(phi::errors::Unimplemented("Not supported on %s.", place)); } // copy the result bool to cpu @@ -173,7 +173,7 @@ class OverflowKernel : public framework::OpKernel { } else { PADDLE_ENFORCE_EQ(true, false, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input type mismatch, the type of Input(X) " "must be phi::DenseTensor or " "SelectedRows, please check your input.")); diff --git a/paddle/fluid/operators/l1_norm_op.h b/paddle/fluid/operators/l1_norm_op.h index c268a6c51fbc5..3cfcf1959a387 100644 --- a/paddle/fluid/operators/l1_norm_op.h +++ b/paddle/fluid/operators/l1_norm_op.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" namespace paddle { namespace operators { @@ -34,7 +34,8 @@ class L1NormKernel : public framework::OpKernel { auto &place = *context.template device_context().eigen_device(); - EigenL1Norm, T>::Eval(place, out, x); + phi::funcs::EigenL1Norm, T>::Eval( + place, out, x); } }; @@ -49,7 +50,7 @@ class L1NormGradKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( d_out->numel(), 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(GRAD@Out) of L1NormGradOP should be a scalar.")); phi::DenseTensor *dx = context.Output(framework::GradVarName("X")); @@ -62,7 +63,7 @@ class L1NormGradKernel : public framework::OpKernel { *context.template device_context().eigen_device(); Eigen::DSizes x_dsize(x->numel()); - EigenL1NormGrad, T>::Eval( + phi::funcs::EigenL1NormGrad, T>::Eval( place, dx_eigen, d_out_eigen, x_eigen, x_dsize); } }; diff --git a/paddle/fluid/operators/limit_by_capacity_op.cc b/paddle/fluid/operators/limit_by_capacity_op.cc index 387e30ae647c9..77c29a4cef9f1 100644 --- a/paddle/fluid/operators/limit_by_capacity_op.cc +++ b/paddle/fluid/operators/limit_by_capacity_op.cc @@ -52,14 +52,14 @@ class LimitByCapacityOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( expert_count_dtype, capacity_dtype, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dtype of the expert_count and capacity should be same")); PADDLE_ENFORCE_EQ( expert_count_dtype, framework::proto::VarType::INT64, - platform::errors::InvalidArgument("The dtype of the expert_count and " - "capacity should be same as int64")); + phi::errors::InvalidArgument("The dtype of the expert_count and " + "capacity should be same as int64")); return phi::KernelKey(expert_count_dtype, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc deleted file mode 100644 index e017e43d7db2d..0000000000000 --- a/paddle/fluid/operators/linear_chain_crf_op.cc +++ /dev/null @@ -1,410 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/linear_chain_crf_op.h" - -#include - -namespace paddle { -namespace operators { - -class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Emission", - "(phi::DenseTensor). When a phi::DenseTensor " - "input,A 2-D phi::DenseTensor" - " with shape [N x D], where N is the size of the " - "mini-batch and D is the total tag number. The unscaled emission " - "weight matrix for the linear chain CRF. When a Tensor input," - "A Tensor with shape [N x S x D], where N is batch number," - "S is max length of sequences, D is the total tag number." 
- "A phi::DenseTensor with type float32, float64."); - AddInput("Transition", - "(Tensor, default Tensor) A 2-D Tensor with shape " - "[(D + 2) x D]. The learnable parameter for the linear_chain_crf " - "operator. See more details in the operator's comments."); - AddInput("Label", - "(phi::DenseTensor), when a phi::DenseTensor input, " - "[N x 1], where N is the total element number in a mini-batch. " - "when a Tensor input, [N x S], where N is batch number. " - "S is max length of sequences. The ground truth." - "A phi::DenseTensor with int64."); - AddInput("Length", - "(Tensor, default Tensor) A Tensor with shape " - "[M x 1], where M is the sequence number in a mini-batch." - "A Tensor with type int64.") - .AsDispensable(); - AddOutput( - "Alpha", - "(Tensor, default Tensor), the same shape with Emission. " - "The forward vectors for the entire batch. Denote it as $\alpha$. " - "$\alpha$ is a memo table used to calculate the normalization " - "factor in CRF. $\alpha[k, v]$ stores the unnormalized " - "probabilities of all possible unfinished sequences of tags that end " - "at position $k$ with tag $v$. For each $k$, " - "$\alpha[k, v]$ is a vector of length $D$ with a component for " - "each tag value $v$. This vector is called a forward vector and " - "will also be used in backward computations.") - .AsIntermediate(); - AddOutput( - "EmissionExps", - "(Tensor, default Tensor), the same shape with Emission. " - "The exponentials of Input(Emission). This is an intermediate " - "computational result in forward computation, and will be reused in " - "backward computation." - "A phi::DenseTensor with type float32, float64.") - .AsIntermediate(); - AddOutput( - "TransitionExps", - "(Tensor, default Tensor) A 2-D Tensor with shape " - "[(D + 2) x D]. The exponentials of Input(Transition). This is an " - "intermediate computational result in forward computation, and " - "will be reused in backward computation." - "A phi::DenseTensor with type float32, float64.") - .AsIntermediate(); - AddOutput( - "LogLikelihood", - "(Tensor, default Tensor) The logarithm of the conditional " - "likelihood of each training sample in a mini-batch. This is a 2-D " - "tensor with shape [S x 1], where S is the sequence number in a " - "mini-batch. Note: S is equal to the sequence number in a mini-batch. " - "A Tensor with type float32, float64."); - AddComment(R"DOC( -Conditional Random Field defines an undirected probabilistic graph with nodes -denoting random variables and edges denoting dependencies between these -variables. CRF learns the conditional probability $P(Y|X)$, where -$X = (x_1, x_2, ... , x_n)$ are structured inputs and -$Y = (y_1, y_2, ... , y_n)$ are labels for the inputs. - -Linear chain CRF is a special case of CRF that is useful for sequence labeling -task. Sequence labeling tasks do not assume a lot of conditional -independences among inputs. The only constraint they impose is that the input -and output must be linear sequences. Thus, the graph of such a CRF is a simple -chain or a line, which results in the linear chain CRF. - -This operator implements the Forward-Backward algorithm for the linear chain -CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and -http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for details. - -Equation: - -1. Denote Input(Emission) to this operator as $x$ here. -2. The first D values of Input(Transition) to this operator are for starting -weights, denoted as $a$ here. -3. 
The next D values of Input(Transition) of this operator are for ending -weights, denoted as $b$ here. -4. The remaining values of Input(Transition) are for transition weights, -denoted as $w$ here. -5. Denote Input(Label) as $s$ here. - -The probability of a sequence $s$ of length $L$ is defined as: -$$P(s) = (1/Z) \exp(a_{s_1} + b_{s_L} - + \sum_{l=1}^L x_{s_l} - + \sum_{l=2}^L w_{s_{l-1},s_l})$$ - -where $Z$ is a normalization value so that the sum of $P(s)$ over -all possible sequences is 1, and $x$ is the emission feature weight -to the linear chain CRF. - -Finally, the linear chain CRF operator outputs the logarithm of the conditional -likelihood of each training sample in a mini-batch. - -NOTE: - -1. The feature function for a CRF is made up of the emission features and the -transition features. The emission feature weights are NOT computed in -this operator. They MUST be computed first before this operator is called. - -2. Because this operator performs global normalization over all possible -sequences internally, it expects UNSCALED emission feature weights. -Please do not call this op with the emission feature being output of any -nonlinear activation. - -3. The 2nd dimension of Input(Emission) MUST be equal to the tag number. - -)DOC"); - } -}; - -class LinearChainCRFOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("Emission"), "Input", "Emission", "LinearChainCRF"); - OP_INOUT_CHECK( - ctx->HasInput("Transition"), "Input", "Transition", "LinearChainCRF"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "LinearChainCRF"); - - OP_INOUT_CHECK( - ctx->HasOutput("Alpha"), "Output", "Alpha", "LinearChainCRF"); - OP_INOUT_CHECK(ctx->HasOutput("EmissionExps"), - "Output", - "EmissionExps", - "LinearChainCRF"); - OP_INOUT_CHECK(ctx->HasOutput("TransitionExps"), - "Output", - "TransitionExps", - "LinearChainCRF"); - OP_INOUT_CHECK(ctx->HasOutput("LogLikelihood"), - "Output", - "LogLikelihood", - "LinearChainCRF"); - - auto transition_dims = ctx->GetInputDim("Transition"); - PADDLE_ENFORCE_EQ(transition_dims.size(), - 2UL, - platform::errors::InvalidArgument( - "The Input(Transition) should be a 2-D tensor. But " - "received: input rank %u, input shape [%s].", - transition_dims.size(), - transition_dims)); - bool check = true; - if ((!ctx->IsRuntime()) && - (transition_dims[0] <= 0 || transition_dims[1] <= 0)) { - check = false; - } - if (check) { - PADDLE_ENFORCE_EQ( - transition_dims[0] - 2, - transition_dims[1], - platform::errors::InvalidArgument( - "An invalid dimension for the Input(Transition), which should " - "be a 2-D tensor with shape [(D + 2) x D]. But received: input " - "rank %u, " - "input shape [%s].", - transition_dims.size(), - transition_dims)); - } - auto emission_dims = ctx->GetInputDim("Emission"); - if (ctx->HasInput("Length")) { - PADDLE_ENFORCE_EQ(emission_dims.size(), - 3, - platform::errors::InvalidArgument( - "The Input(Emission) should be a 3-D tensor. But " - "received: input rank %u, input shape [%s].", - emission_dims.size(), - emission_dims)); - auto label_dims = ctx->GetInputDim("Label"); - PADDLE_ENFORCE_EQ( - (label_dims.size() == 3UL && label_dims[2] == 1) || - (label_dims.size() == 2UL), - true, - platform::errors::InvalidArgument( - "The Input(Label) should be a 3-D tensor with last dimension " - "fixed to 1 or a 2-D tensor in padding mode. 
But received: input " - "rank %u, input shape [%s].", - label_dims.size(), - label_dims)); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(emission_dims[0], - label_dims[0], - platform::errors::InvalidArgument( - "The batch size of Input(Emission) " - "and Input(Label) should be the same. But " - "received Input(Emission): " - "rank %u, shape [%s]; received Input(Label): " - "rank %u, shape [%s].", - emission_dims.size(), - emission_dims, - label_dims.size(), - label_dims)); - PADDLE_ENFORCE_EQ(emission_dims[1], - label_dims[1], - platform::errors::InvalidArgument( - "The max length of Input(Emission) " - "and Input(Label) should be the same. But " - "received Input(Emission): " - "rank %u, shape [%s]; received Input(Label): " - "rank %u, shape [%s].", - emission_dims.size(), - emission_dims, - label_dims.size(), - label_dims)); - } - } else { - PADDLE_ENFORCE_EQ( - emission_dims.size(), - 2, - platform::errors::InvalidArgument( - "The Input(Emission) should be a 2-D tensor. But received: " - "input rank %u, input shape [%s].", - emission_dims.size(), - emission_dims)); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(emission_dims[1], - transition_dims[1], - platform::errors::InvalidArgument( - "The 2nd dimension of the Input(Emission) and " - "the Input(Transition) " - "should be equal to the tag number. But received " - "Input(Emission): rank " - "%u, shape [%s]; received Input(Transition): " - "rank %u, shape [%s].", - emission_dims.size(), - emission_dims, - transition_dims.size(), - transition_dims)); - } - - auto label_dims = ctx->GetInputDim("Label"); - PADDLE_ENFORCE_EQ( - label_dims.size(), - 2, - platform::errors::InvalidArgument( - "The Input(Label) should be a 2-D tensor with the 2nd " - "dimensions fixed to 1. But received: input rank %u, " - "input shape [%s].", - label_dims.size(), - label_dims)); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - emission_dims[0], - label_dims[0], - platform::errors::InvalidArgument( - "The first dimension of Input(Emission) and Input(Label) " - "should be the same. But received Input(Emission): rank %u, " - "shape " - "[%s]; received Input(Label): rank %u, shape [%s].", - emission_dims.size(), - emission_dims, - label_dims.size(), - label_dims)); - } - } - ctx->SetOutputDim("Alpha", emission_dims); - ctx->SetOutputDim("EmissionExps", emission_dims); - ctx->SetOutputDim("TransitionExps", transition_dims); - // TODO(caoying) This is tricky. The 1st dimension of Output(LogLikelihood) - // is the sequence number in a mini-batch. The dimension set here should be - // resized to its correct size in the function Compute. Fix this once we can - // get LoD information in the InferShape interface. - ctx->SetOutputDim("LogLikelihood", {emission_dims[0], 1}); - } - - protected: - // Explicitly set that the data type of computation kernel of linear_chain_crf - // is determined by its input "Emission". 
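// (So a float32 or float64 Emission selects the float or double CPU kernel
// registered at the bottom of this file; the int64 Label takes no part in
// kernel selection.)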
- phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey( - OperatorWithKernel::IndicateVarDataType(ctx, "Emission"), - platform::CPUPlace()); - } -}; - -class LinearChainCRFGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("EmissionExps"), - "Input", - "EmissionExps", - "LinearChainCRFGrad"); - OP_INOUT_CHECK(ctx->HasInput("TransitionExps"), - "Input", - "TransitionExps", - "LinearChainCRFGrad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("LogLikelihood")), - "Input", - framework::GradVarName("LogLikelihood"), - "LinearChainCRFGrad"); - - auto transition_exps_dims = ctx->GetInputDim("TransitionExps"); - auto emission_exps_dims = ctx->GetInputDim("EmissionExps"); - if (ctx->HasOutput(framework::GradVarName("Emission"))) { - ctx->SetOutputDim(framework::GradVarName("Emission"), emission_exps_dims); - if (ctx->HasInput("Length") == false) { - ctx->ShareLoD("Emission", framework::GradVarName("Emission")); - } - } - - if (ctx->HasOutput(framework::GradVarName("Transition"))) { - ctx->SetOutputDim(framework::GradVarName("Transition"), - transition_exps_dims); - ctx->ShareLoD("Transition", framework::GradVarName("Transition")); - } - } - - protected: - // Explicitly set that the data type of output of the linear_chain_crf_grad - // operator is determined by its input: gradients of LogLikelihood. - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("LogLikelihood")), - platform::CPUPlace()); - } -}; - -template -class LinearChainCRFGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("linear_chain_crf_grad"); - op->SetAttrMap(this->Attrs()); - op->SetInput("Emission", this->Input("Emission")); - op->SetInput("Transition", this->Input("Transition")); - op->SetInput("Label", this->Input("Label")); - op->SetInput("Alpha", this->Output("Alpha")); - op->SetInput("EmissionExps", this->Output("EmissionExps")); - op->SetInput("TransitionExps", this->Output("TransitionExps")); - if (this->HasInput("Length")) { - op->SetInput("Length", this->Input("Length")); - } - op->SetInput(framework::GradVarName("LogLikelihood"), - this->OutputGrad("LogLikelihood")); - - op->SetOutput(framework::GradVarName("Emission"), - this->InputGrad("Emission")); - op->SetOutput(framework::GradVarName("Transition"), - this->InputGrad("Transition")); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(LinearChainCRFGradNoNeedBufferVarsInferer, - "Transition", - "Emission"); - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(linear_chain_crf, - ops::LinearChainCRFOp, - ops::LinearChainCRFOpMaker, - ops::LinearChainCRFGradMaker, - ops::LinearChainCRFGradMaker); -REGISTER_OPERATOR(linear_chain_crf_grad, - ops::LinearChainCRFGradOp, - ops::LinearChainCRFGradNoNeedBufferVarsInferer); - -PD_REGISTER_STRUCT_KERNEL(linear_chain_crf, - CPU, - ALL_LAYOUT, - ops::LinearChainCRFOpKernel, - float, - double) {} -PD_REGISTER_STRUCT_KERNEL(linear_chain_crf_grad, - CPU, - ALL_LAYOUT, - ops::LinearChainCRFGradOpKernel, - float, - double) {} diff --git 
a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h deleted file mode 100644 index 2891320506391..0000000000000 --- a/paddle/fluid/operators/linear_chain_crf_op.h +++ /dev/null @@ -1,457 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -static inline T NormalizeL1(T* x, size_t len) { - T sum = 0.; - for (size_t i = 0; i < len; ++i) sum += x[i]; - // (This comment is from the old LinearChainCRFLayer.) - // Right now, we just bet that sum won't be zero. If this really happens, we - // will figure out what should be done then. - PADDLE_ENFORCE_GT( - sum, - 0., - platform::errors::InvalidArgument( - "The unnormalized probabilities of all possible unfinished " - "sequences must be greater than 0.")); - T s = 1. / sum; - for (size_t i = 0; i < len; ++i) x[i] *= s; - return sum; -} - -template -struct ScalarMul { - explicit ScalarMul(const T& scalar) : scalar(scalar) {} - T operator()(const T& val) const { return val * scalar; } - - T scalar; -}; - -using framework::LoD; - -template -class LinearChainCRFOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* emission_weights = - ctx.Input("Emission"); - const phi::DenseTensor* transition_weights = - ctx.Input("Transition"); - - phi::DenseTensor* emission_exps = - ctx.Output("EmissionExps"); - phi::DenseTensor* transition_exps = - ctx.Output("TransitionExps"); - phi::DenseTensor* alpha = ctx.Output("Alpha"); - phi::DenseTensor* ll = ctx.Output("LogLikelihood"); - - // Because the computation codes only runs on CPU, here the memory for all - // the outputs is FIXED to be allocated on the CPU memory. - emission_exps->mutable_data(platform::CPUPlace()); - alpha->mutable_data(platform::CPUPlace()); - transition_exps->mutable_data(platform::CPUPlace()); - auto emission_dims = emission_weights->dims(); - - const phi::DenseTensor* label = ctx.Input("Label"); - phi::DenseTensor emission_weights_tmp = *emission_weights; - phi::DenseTensor label_tmp = *label; - phi::DenseTensor emission_exps_tmp = *emission_exps; - phi::DenseTensor alpha_tmp = *alpha; - int64_t seq_num = 0; - int64_t batch_size; - int64_t tag_num; - const int64_t* length_data = nullptr; - framework::LoD in_lod; - if (ctx.HasInput("Length")) { - const phi::DenseTensor* label_length = - ctx.Input("Length"); - length_data = label_length->data(); - seq_num = label_length->numel(); - PADDLE_ENFORCE_EQ( - seq_num, - emission_dims[0], - platform::errors::InvalidArgument( - "the size of Input(length) must be equal to " - "emission_dims[0]. 
But input_size = %d, emission_dims[0] = %d.", - seq_num, - emission_dims[0])); - auto label_dims = label->dims(); - PADDLE_ENFORCE_EQ( - seq_num, - label_dims[0], - platform::errors::InvalidArgument( - "the size of Input(length) must be equal to " - "label_dims[0]. But input_size = %d, label_dims[0] = %d.", - seq_num, - label_dims[0])); - - batch_size = emission_dims[0] * emission_dims[1]; - tag_num = emission_dims[2]; - emission_weights_tmp.Resize({batch_size, tag_num}); - label_tmp.Resize({batch_size, 1}); - alpha_tmp.Resize({batch_size, tag_num}); - emission_exps_tmp.Resize({batch_size, tag_num}); - phi::funcs::set_constant( - ctx.device_context(), emission_exps, static_cast(0.0)); - phi::funcs::set_constant( - ctx.device_context(), alpha, static_cast(0.0)); - } else { - in_lod = ctx.Input("Label")->lod(); - PADDLE_ENFORCE_NE(in_lod.size(), - 0, - platform::errors::InvalidArgument( - "Input(Label) must be a sequence.")); - seq_num = in_lod[0].size() - 1; - batch_size = emission_dims[0]; - tag_num = emission_dims[1]; - } - - // Resize the output tensor to its correct dimension. - ll->Resize({seq_num, 1}); - ll->mutable_data(platform::CPUPlace()); - // Now, all the inputs and outputs should be on the CPU memory. - phi::DenseTensor emission_row_max; - emission_row_max.mutable_data( - common::make_ddim({static_cast(batch_size), 1}), - platform::CPUPlace()); - auto& place = - *ctx.template device_context().eigen_device(); - auto x = framework::EigenMatrix::From(emission_weights_tmp); - auto x_row_max = framework::EigenMatrix::From(emission_row_max); - x_row_max.device(place) = - x.maximum(Eigen::DSizes(1)) - .reshape(Eigen::DSizes(static_cast(batch_size), 1)); - auto x_exps = framework::EigenMatrix::From(emission_exps_tmp); - x_exps.device(place) = - (x - x_row_max.broadcast(Eigen::DSizes(1, tag_num))).exp(); - auto w = framework::EigenMatrix::From(*transition_weights); - auto w_exps = framework::EigenMatrix::From(*transition_exps); - w_exps.device(place) = w.exp(); - T* log_likelihood = ll->data(); - for (int64_t i = 0; i < seq_num; ++i) { - int64_t start_pos = 0; - int64_t end_pos = 0; - if (ctx.HasInput("Length")) { - start_pos = i * emission_dims[1]; - end_pos = start_pos + length_data[i]; - } else { - start_pos = static_cast(in_lod[0][i]); - end_pos = static_cast(in_lod[0][i + 1]); - } - if (end_pos == start_pos) { - // If an empty input sequence is given, pad 0 for its cost. 
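// (end_pos == start_pos occurs when length_data[i] == 0 in padding mode, or
// when two adjacent LoD offsets coincide; writing 0 keeps LogLikelihood's
// [seq_num, 1] shape intact.)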
- log_likelihood[i] = 0.; - continue; - } - const phi::DenseTensor one_seq = - emission_weights_tmp.Slice(start_pos, end_pos); - phi::DenseTensor one_seq_row_max = - emission_row_max.Slice(start_pos, end_pos); - phi::DenseTensor one_seq_exps = - emission_exps_tmp.Slice(start_pos, end_pos); - const phi::DenseTensor one_seq_label = - label_tmp.Slice(start_pos, end_pos); - phi::DenseTensor one_seq_alpha = alpha_tmp.Slice(start_pos, end_pos); - log_likelihood[i] = ForwardOneSequence(one_seq, - one_seq_row_max, - one_seq_exps, - *transition_weights, - *transition_exps, - one_seq_label, - &one_seq_alpha); - } - }; - - private: - T ForwardOneSequence(const phi::DenseTensor& emission, - const phi::DenseTensor& emission_row_max, - const phi::DenseTensor& emission_exps, - const phi::DenseTensor& trans_weights, - const phi::DenseTensor& trans_weight_exps, - const phi::DenseTensor& label, - phi::DenseTensor* alpha) const { - const T* x = emission.data(); - const T* x_row_max = emission_row_max.data(); - const T* x_exps = emission_exps.data(); - const T* w = trans_weights.data(); - const T* w_exps = trans_weight_exps.data(); - T* alpha_value = alpha->data(); - - auto x_dims = emission.dims(); - const size_t seq_length = x_dims[0]; - const size_t tag_num = x_dims[1]; - // The 1st row of w are transition weights for start mask. - // The 2nd row of w are transition weights for end mask. - // Transition weights between other tags begin from the 3rd row of w. - const size_t state_trans_base_idx = 2; - - for (size_t i = 0; i < tag_num; ++i) { - alpha_value[i] = w_exps[i] * x_exps[i]; - } - T ll = -x_row_max[0] - std::log(NormalizeL1(alpha_value, tag_num)); - - for (size_t k = 1; k < seq_length; ++k) { - for (size_t i = 0; i < tag_num; ++i) { - T sum = 0.; - for (size_t j = 0; j < tag_num; ++j) { - sum += alpha_value[(k - 1) * tag_num + j] * // (*) - w_exps[(j + state_trans_base_idx) * tag_num + i]; - } - alpha_value[k * tag_num + i] = x_exps[k * tag_num + i] * sum; - } - // NormalizeL1 is to avoid underflow or overflow at (*). - ll -= x_row_max[k] + - std::log(NormalizeL1(alpha_value + k * tag_num, tag_num)); - } - T sum = 0.; - for (size_t i = 0; i < tag_num; ++i) { - sum += alpha_value[(seq_length - 1) * tag_num + i] * w_exps[tag_num + i]; - } - ll -= std::log(sum); - // Now ll is equal to -log(Z). - - const int64_t* lbl = label.data(); - PADDLE_ENFORCE_LT( - static_cast(*std::max_element(lbl, lbl + seq_length)), - tag_num, - platform::errors::InvalidArgument( - "An invalid tag label that excesses the largest tag number.")); - - // Calculate the nominator part, which depends on the label sequence. 
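// (In the notation of the op comment above, this nominator is
// a_{s_1} + \sum_{l} x_{l, s_l} + \sum_{l >= 2} w_{s_{l-1}, s_l} + b_{s_L}.
// Since ll holds -log(Z) at this point, the -ll returned below equals
// log(Z) - score(s).)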
- ll += w[lbl[0]] /*start transition*/ + x[lbl[0]] + - w[tag_num + lbl[seq_length - 1]] /*end transition*/; - for (size_t k = 1; k < seq_length; ++k) { - ll += x[k * tag_num + lbl[k]] + - w[(lbl[k - 1] + state_trans_base_idx) * tag_num + lbl[k]]; - } - return -ll; - } -}; - -template -class LinearChainCRFGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* label = ctx.Input("Label"); - const phi::DenseTensor* emission_exps = - ctx.Input("EmissionExps"); - const phi::DenseTensor* transition_exps = - ctx.Input("TransitionExps"); - const phi::DenseTensor* alpha = ctx.Input("Alpha"); - const T* ll_grad = - ctx.Input(framework::GradVarName("LogLikelihood")) - ->data(); - phi::DenseTensor* emission_grad = - ctx.Output(framework::GradVarName("Emission")); - auto* emission_grad_data = - emission_grad->mutable_data(platform::CPUPlace()); - memset(emission_grad_data, 0, emission_grad->numel() * sizeof(T)); - phi::DenseTensor alpha_tmp = *alpha; - phi::DenseTensor label_tmp = *label; - phi::DenseTensor emission_exps_tmp = *emission_exps; - phi::DenseTensor emission_grad_tmp = *emission_grad; - // getting seq_num using padding or not - int64_t seq_num = 0; - framework::LoD in_lod; - const int64_t* length_data = nullptr; - if (ctx.HasInput("Length")) { - const phi::DenseTensor* label_length = - ctx.Input("Length"); - length_data = label_length->data(); - seq_num = label_length->numel(); - auto emission_dims = emission_grad->dims(); - auto label_dims = label->dims(); - emission_grad_tmp.Resize( - {emission_dims[0] * emission_dims[1], emission_dims[2]}); - label_tmp.Resize({label_dims[0] * label_dims[1], 1}); - alpha_tmp.Resize({emission_dims[0] * emission_dims[1], emission_dims[2]}); - emission_exps_tmp.Resize( - {emission_dims[0] * emission_dims[1], emission_dims[2]}); - } else { - in_lod = ctx.Input("Label")->lod(); - PADDLE_ENFORCE_NE(in_lod.size(), - 0, - platform::errors::InvalidArgument( - "Input(Label) must be a sequence.")); - seq_num = static_cast(in_lod[0].size() - 1); - } - - phi::DenseTensor* transition_grad = - ctx.Output(framework::GradVarName("Transition")); - - // TODO(caoying) Fix this constraint. When the Input(Emission) is from the - // data reader operator, it can have no gradients. - if (transition_grad) { - transition_grad->mutable_data(platform::CPUPlace()); - phi::funcs::set_constant( - ctx.device_context(), transition_grad, static_cast(0.)); - } - // Now, all the inputs and outputs should be on the CPU memory. - auto emission_dims = emission_exps->dims(); - // Beta is the memo table used in dynamic programming to calculate the - // backward vectors. For a backward vector i (the i-th row of beta), it - // captures the unnormalized probabilities of partial sequences starting - // at position i. 
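Before the backward table declared just below, a worked toy version of the forward recursion deleted above may help. Everything here is invented for illustration (2 tags, one sequence of length 3, all weight exponentials set to 1), and the row-max subtraction the kernel performs for numerical stability is dropped because the toy values cannot overflow; the indexing scheme and the L1 normalization mirror ForwardOneSequence:

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int tag_num = 2, seq_len = 3, base = 2;  // base = state_trans_base_idx
  // Rows 0 and 1 of w_exps hold start/end weights; transitions begin at row 2.
  std::vector<double> w_exps((tag_num + base) * tag_num, 1.0);
  std::vector<double> x_exps(seq_len * tag_num, 1.0);  // exp(emissions)
  std::vector<double> alpha(seq_len * tag_num, 0.0);

  // NormalizeL1: rescale a row to sum to 1 and return the original sum.
  auto normalize_l1 = [](double* v, int n) {
    double sum = 0.;
    for (int i = 0; i < n; ++i) sum += v[i];
    for (int i = 0; i < n; ++i) v[i] /= sum;
    return sum;
  };

  // Initial state: start weights times first emissions.
  for (int i = 0; i < tag_num; ++i) alpha[i] = w_exps[i] * x_exps[i];
  double ll = -std::log(normalize_l1(alpha.data(), tag_num));
  for (int k = 1; k < seq_len; ++k) {
    for (int i = 0; i < tag_num; ++i) {
      double sum = 0.;
      for (int j = 0; j < tag_num; ++j)
        sum += alpha[(k - 1) * tag_num + j] * w_exps[(j + base) * tag_num + i];
      alpha[k * tag_num + i] = x_exps[k * tag_num + i] * sum;
    }
    ll -= std::log(normalize_l1(alpha.data() + k * tag_num, tag_num));
  }
  double end_sum = 0.;  // close every path with its end-weight exponential
  for (int i = 0; i < tag_num; ++i)
    end_sum += alpha[(seq_len - 1) * tag_num + i] * w_exps[tag_num + i];
  ll -= std::log(end_sum);
  // With all exponentials equal to 1 there are 2^3 = 8 unit-score paths,
  // so Z = 8 and ll = -log(8) ~= -2.0794.
  std::printf("-log(Z) = %.4f\n", ll);
  return 0;
}

The beta table declared next plays the symmetric role for the backward direction.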
- phi::DenseTensor beta; - beta.mutable_data(emission_dims, platform::CPUPlace()); - if (ctx.HasInput("Length")) { - beta.Resize({emission_dims[0] * emission_dims[1], emission_dims[2]}); - } - - for (int64_t i = 0; i < seq_num; ++i) { - int64_t start_pos = 0; - int64_t end_pos = 0; - if (ctx.HasInput("Length")) { - start_pos = i * emission_dims[1]; - end_pos = start_pos + length_data[i]; - } else { - start_pos = static_cast(in_lod[0][i]); - end_pos = static_cast(in_lod[0][i + 1]); - } - - if (end_pos == start_pos) { - continue; - } - const phi::DenseTensor one_seq_emission_exps = - emission_exps_tmp.Slice(start_pos, end_pos); - const phi::DenseTensor one_seq_label = - label_tmp.Slice(start_pos, end_pos); - const phi::DenseTensor one_seq_alpha = - alpha_tmp.Slice(start_pos, end_pos); - phi::DenseTensor one_seq_beta = beta.Slice(start_pos, end_pos); - phi::DenseTensor one_seq_emission_grad = - emission_grad_tmp.Slice(start_pos, end_pos); - BackwardOneSequence(ctx.template device_context(), - ll_grad[i], - one_seq_emission_exps, - *transition_exps, - one_seq_alpha, - one_seq_label, - &one_seq_beta, - transition_grad, - &one_seq_emission_grad); - } - }; - - private: - void BackwardOneSequence(const phi::CPUContext& ctx, - const T ll_grad, - const phi::DenseTensor& emission_exps, - const phi::DenseTensor& transition_exps, - const phi::DenseTensor& alpha, - const phi::DenseTensor& label, - phi::DenseTensor* beta, - phi::DenseTensor* transition_grad, - phi::DenseTensor* emission_grad) const { - const T* w_exps = transition_exps.data(); - const T* x_exps = emission_exps.data(); - const int64_t* label_value = label.data(); - T* beta_value = beta->data(); - auto x_dims = emission_exps.dims(); - const size_t seq_length = x_dims[0]; - const size_t tag_num = x_dims[1]; - const size_t state_trans_base_idx = 2; - - // Calculate the backward vectors: beta. - // First, calculate the initial state. - for (size_t i = 0; i < tag_num; ++i) { - beta_value[(seq_length - 1) * tag_num + i] = w_exps[tag_num + i]; - } - NormalizeL1(beta_value + (seq_length - 1) * tag_num, tag_num); - for (int k = static_cast(seq_length) - 2; k >= 0; --k) { - for (size_t i = 0; i < tag_num; ++i) { - T sum = 0.; - for (size_t j = 0; j < tag_num; ++j) { - sum += w_exps[(i + state_trans_base_idx) * tag_num + j] * // (**) - x_exps[(k + 1) * tag_num + j] * - beta_value[(k + 1) * tag_num + j]; - } - beta_value[k * tag_num + i] = sum; - } - // NormalizeL1 is to avoid underflow or overflow at (**). - NormalizeL1(beta_value + k * tag_num, tag_num); - } - - auto x_grad_mat = framework::EigenMatrix::From(*emission_grad); - auto alpha_mat = framework::EigenMatrix::From(alpha); - auto beta_mat = framework::EigenMatrix::From(*beta); - - auto* place = ctx.eigen_device(); - auto prob = alpha_mat * beta_mat; - auto row_sum = prob.sum(Eigen::DSizes(1)) - .reshape(Eigen::DSizes(seq_length, 1)) - .broadcast(Eigen::DSizes(1, tag_num)); - x_grad_mat.device(*place) = - (prob / row_sum).unaryExpr(ScalarMul(ll_grad)); - - for (size_t k = 0; k < seq_length; ++k) { - x_grad_mat(k, label_value[k]) -= static_cast(ll_grad); - } - - if (transition_grad) { - T* trans_grad = transition_grad->data(); - for (size_t k = 0; k < tag_num; ++k) { - // Do not multiply by the output gradient here, because x_grad_mat has - // already done this. 
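// (x_grad_mat already holds (marginal probability - one-hot gold label) *
// ll_grad, which at the first and last time steps coincides with the start-
// and end-weight gradients being accumulated here.)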
- trans_grad[k] += x_grad_mat(/*from start state*/ 0, k); - trans_grad[tag_num + k] += - x_grad_mat(/*to end state*/ seq_length - 1, k); - } - - auto x_exps_mat = framework::EigenMatrix::From(emission_exps); - - // TODO(caoying): Fix this to avoid using this local variable if we can - // profile the training process. - phi::DenseTensor tmp; - tmp.mutable_data(beta->dims(), platform::CPUPlace()); - auto tmp_mat = framework::EigenMatrix::From(tmp); - auto prob = beta_mat * x_exps_mat; - auto row_sum = prob.sum(Eigen::DSizes(1)) - .reshape(Eigen::DSizes(seq_length, 1)) - .broadcast(Eigen::DSizes(1, tag_num)); - tmp_mat.device(*place) = prob / row_sum; - - for (size_t k = 1; k < seq_length; ++k) { - T sum = 0.; - for (size_t i = 0; i < tag_num; ++i) { - for (size_t j = 0; j < tag_num; ++j) { - sum += w_exps[(i + state_trans_base_idx) * tag_num + j] * // (**) - alpha_mat(k - 1, i) * tmp_mat(k, j); - } - } - sum = 1. / sum; - for (size_t i = 0; i < tag_num; ++i) { - for (size_t j = 0; j < tag_num; ++j) { - trans_grad[(i + state_trans_base_idx) * tag_num + j] += - sum * w_exps[(i + state_trans_base_idx) * tag_num + j] * - alpha_mat(k - 1, i) * tmp_mat(k, j) * ll_grad; - } - } - trans_grad[(label_value[k - 1] + state_trans_base_idx) * tag_num + - label_value[k]] -= static_cast(ll_grad); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/lite/CMakeLists.txt b/paddle/fluid/operators/lite/CMakeLists.txt deleted file mode 100644 index ca3b62648378b..0000000000000 --- a/paddle/fluid/operators/lite/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -op_library(lite_engine_op DEPS lite_engine lite_tensor_utils) diff --git a/paddle/fluid/operators/lite/lite_engine_op.cc b/paddle/fluid/operators/lite/lite_engine_op.cc deleted file mode 100644 index 0ec1c55f7abee..0000000000000 --- a/paddle/fluid/operators/lite/lite_engine_op.cc +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/lite/lite_engine_op.h" - -#include -#include - -namespace paddle { - -namespace operators { - -class LiteEngineOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Xs", "A list of inputs.").AsDuplicable(); - AddOutput("Ys", "A list of outputs.").AsDuplicable(); - AddAttr( - "engine_key", - "The engine_key here is used to distinguish different Lite Engines"); - AddComment("Lite engine operator."); - } -}; - -class LiteInferVarType : public framework::VarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override {} -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(lite_engine, ops::LiteEngineOp, ops::LiteEngineOpMaker); diff --git a/paddle/fluid/operators/lite/lite_engine_op.h b/paddle/fluid/operators/lite/lite_engine_op.h deleted file mode 100644 index 756fec24d9874..0000000000000 --- a/paddle/fluid/operators/lite/lite_engine_op.h +++ /dev/null @@ -1,105 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/inference/analysis/helper.h" -#include "paddle/fluid/inference/lite/engine.h" -#include "paddle/fluid/inference/lite/tensor_utils.h" -#include "paddle/fluid/inference/utils/singleton.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" - -namespace paddle { -namespace operators { - -class LiteEngineOp : public framework::OperatorBase { - private: - std::vector in_names_; - std::vector out_names_; - paddle::lite_api::PaddlePredictor *engine_; - framework::proto::VarType::Type precision_; - bool use_gpu_; - bool zero_copy_; - - public: - LiteEngineOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) { - in_names_ = Inputs("Xs"); - out_names_ = Outputs("Ys"); - engine_ = - inference::Singleton::Global().Get( - Attr("engine_key")); - if (Attr("enable_int8")) { - precision_ = framework::proto::VarType_Type_INT8; - } else { - precision_ = framework::proto::VarType_Type_FP32; - } - use_gpu_ = Attr("use_gpu"); - zero_copy_ = Attr("zero_copy"); - } - - void SetEngine(paddle::lite_api::PaddlePredictor *engine) { - engine_ = engine; - } - - protected: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - Execute(scope, dev_place); - } - - void Execute(const framework::Scope &scope, - const platform::Place &dev_place) const { - const platform::DeviceContext *ctx = - platform::DeviceContextPool::Instance().Get(dev_place); - for (size_t i = 0; i < in_names_.size(); i++) { - phi::DenseTensor src_t = - 
inference::analysis::GetFromScope(scope, - in_names_[i]); - paddle::lite_api::Tensor dst_t = *(engine_->GetInput(i)); - VLOG(3) << "== fluid -> lite (" << in_names_[i] << " -> " - << engine_->GetInputNames()[i] << ")"; - inference::lite::utils::TensorCopy(&dst_t, &src_t, *ctx, zero_copy_); - } - VLOG(3) << "lite engine run"; - engine_->Run(); - VLOG(3) << "lite engine run done"; - for (size_t i = 0; i < out_names_.size(); i++) { - paddle::lite_api::Tensor src_t = *(engine_->GetOutput(i)); - phi::DenseTensor *dst_t = - &inference::analysis::GetFromScope(scope, - out_names_[i]); - VLOG(3) << "== lite -> fluid (" << out_names_[i] << " -> " - << engine_->GetOutputNames()[i] << ")"; - inference::lite::utils::TensorCopy(dst_t, &src_t, *ctx, zero_copy_); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/lite/ut_helper.h b/paddle/fluid/operators/lite/ut_helper.h index 3d574b1f844c8..ba55b7066da1e 100644 --- a/paddle/fluid/operators/lite/ut_helper.h +++ b/paddle/fluid/operators/lite/ut_helper.h @@ -60,8 +60,8 @@ void serialize_params(std::string* str, for (const auto& param : params) { PADDLE_ENFORCE_NOT_NULL( scope->FindVar(param), - platform::errors::NotFound("Block should already have a '%s' variable", - param)); + phi::errors::NotFound("Block should already have a '%s' variable", + param)); auto* tensor = scope->FindVar(param)->GetMutable(); framework::SerializeToStream(os, *tensor, ctx); } @@ -81,7 +81,7 @@ void RandomizeTensor(phi::DenseTensor* tensor, const platform::Place& place) { size_t num_elements = analysis::AccuDims(dims, dims.size()); PADDLE_ENFORCE_GT(num_elements, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input tensor dimension of the randomized tensor " "function should be greater than zero.")); platform::CPUPlace cpu_place; diff --git a/paddle/fluid/operators/load_combine_op.h b/paddle/fluid/operators/load_combine_op.h index 4641c39111fad..be94eab242491 100644 --- a/paddle/fluid/operators/load_combine_op.h +++ b/paddle/fluid/operators/load_combine_op.h @@ -40,7 +40,7 @@ class LoadCombineOpKernel : public framework::OpKernel { PADDLE_ENFORCE_GT(out_var_names.size(), 0UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of variables to be loaded is %d, expect " "it to be greater than 0.", out_var_names.size())); @@ -49,7 +49,7 @@ class LoadCombineOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( static_cast(fin), true, - platform::errors::Unavailable( + phi::errors::Unavailable( "LoadCombine operator fails to open file %s, please check " "whether the model file is complete or damaged.", filename)); @@ -58,7 +58,7 @@ class LoadCombineOpKernel : public framework::OpKernel { PADDLE_ENFORCE_NE( filename.empty(), true, - platform::errors::Unavailable( + phi::errors::Unavailable( "LoadCombine operator fails to open file %s, please check " "whether the model file is complete or damaged.", filename)); @@ -81,14 +81,14 @@ class LoadCombineOpKernel : public framework::OpKernel { VLOG(4) << "loading tensor: " << out_var_names[i]; PADDLE_ENFORCE_NOT_NULL( out_vars[i], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The variable %s to be loaded cannot be found.", out_var_names[i])); // Error checking PADDLE_ENFORCE_EQ( static_cast(*buffer), true, - platform::errors::Unavailable( + phi::errors::Unavailable( "An error occurred while loading model parameters. 
" "Please check whether the model file is complete or damaged.")); if (out_vars[i]->IsType()) { @@ -142,7 +142,7 @@ class LoadCombineOpKernel : public framework::OpKernel { buffer->peek(); PADDLE_ENFORCE_EQ(buffer->eof(), true, - platform::errors::Unavailable( + phi::errors::Unavailable( "Not allowed to load partial data via " "load_combine_op, please use load_op instead.")); } diff --git a/paddle/fluid/operators/lod_rank_table_op.cc b/paddle/fluid/operators/lod_rank_table_op.cc index a399ad4527ff8..310fb619bcb01 100644 --- a/paddle/fluid/operators/lod_rank_table_op.cc +++ b/paddle/fluid/operators/lod_rank_table_op.cc @@ -72,10 +72,9 @@ output operators. class LoDRankTableInferShape : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext *context) const override { - PADDLE_ENFORCE_EQ( - context->HasInput("X"), - true, - platform::errors::NotFound("LoDRankTable must have input X.")); + PADDLE_ENFORCE_EQ(context->HasInput("X"), + true, + phi::errors::NotFound("LoDRankTable must have input X.")); } }; diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc index ae464e7b47161..654bc669c7504 100644 --- a/paddle/fluid/operators/lod_reset_op.cc +++ b/paddle/fluid/operators/lod_reset_op.cc @@ -33,7 +33,7 @@ class LoDResetOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT( static_cast(level0.size()), 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "If Input(Y) is not provided, the output's LoD should be " "specified by attribute 'target_lod'. But the size of " "'target_lod' is 0.")); @@ -252,7 +252,7 @@ PD_REGISTER_STRUCT_KERNEL(lod_reset, CPU, ALL_LAYOUT, ops::LoDResetKernel, - plat::float16, + phi::dtype::float16, float, double, int, @@ -263,7 +263,7 @@ PD_REGISTER_STRUCT_KERNEL(lod_reset, XPU, ALL_LAYOUT, ops::LoDResetKernel, - plat::float16, + phi::dtype::float16, float, double, int, @@ -274,7 +274,7 @@ PD_REGISTER_STRUCT_KERNEL(lod_reset_grad, CPU, ALL_LAYOUT, ops::LoDResetGradKernel, - plat::float16, + phi::dtype::float16, float, double, int, diff --git a/paddle/fluid/operators/lod_reset_op.h b/paddle/fluid/operators/lod_reset_op.h index a468577ab9aa1..acba05514226b 100644 --- a/paddle/fluid/operators/lod_reset_op.h +++ b/paddle/fluid/operators/lod_reset_op.h @@ -54,7 +54,7 @@ class LoDResetKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( static_cast(last_level.back()), in->dims()[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The last value of Input(Y)'s last level LoD should be equal " "to the first dimension of Input(X). But received the last " "value of Input(Y)'s last level LoD is %d, the first dimension " @@ -79,20 +79,20 @@ class LoDResetKernel : public framework::OpKernel { PADDLE_ENFORCE_GT( level0.size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of target LoD should be greater than 1. But received the " "size of target LoD is %d.", level0.size())); PADDLE_ENFORCE_EQ(static_cast(level0[0]), 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Target LoD should be a vector starting from 0. But " "target LoD starts from %d.", static_cast(level0[0]))); PADDLE_ENFORCE_EQ( static_cast(level0.back()), in->dims()[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The last value of 'Target LoD''s last level LoD should be equal " "to the first dimension of Input(X). 
But received the 'Target LoD' " "is %s, Input(X)'s shape is %s.", @@ -101,7 +101,7 @@ class LoDResetKernel : public framework::OpKernel { for (size_t i = 0; i < level0.size() - 1; ++i) { PADDLE_ENFORCE_GE(level0[i + 1], level0[i], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'Target LoD' should be an ascending " "vector. But received the Target LoD is %s.", common::make_ddim(level0))); diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index 94b0319729117..42f6a4786fb25 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -70,7 +70,7 @@ struct LoDTensorToArrayFunctor { Apply(static_cast(dev_ctx)); #else PADDLE_THROW( - platform::errors::Unavailable("Paddle is not compiled with CUDA.")); + phi::errors::Unavailable("Paddle is not compiled with CUDA.")); #endif } } @@ -126,11 +126,11 @@ class LoDTensorToArrayOp : public framework::OperatorBase { PADDLE_ENFORCE_LT( rank_level, x.lod().size(), - platform::errors::InvalidArgument("Input should be a phi::DenseTensor, " - "and its lod_level should be at " - "least %d, but given is %d.", - rank_level + 1, - x.lod().size())); + phi::errors::InvalidArgument("Input should be a phi::DenseTensor, " + "and its lod_level should be at " + "least %d, but given is %d.", + rank_level + 1, + x.lod().size())); out.resize(max_seq_len); std::vector> copy_ranges(max_seq_len); @@ -215,18 +215,18 @@ class LoDTensorToArrayInferShape : public framework::InferShapeBase { PADDLE_ENFORCE_EQ( context->HasInput("X"), true, - platform::errors::NotFound( + phi::errors::NotFound( "Input(X) of LoDTensorToArrayOp should not be null.")); PADDLE_ENFORCE_EQ( context->HasInput("RankTable"), true, - platform::errors::NotFound( + phi::errors::NotFound( "Input(RankTable) of LoDTensorToArrayOp should not be null.")); PADDLE_ENFORCE_EQ( context->HasOutput("Out"), true, - platform::errors::NotFound( + phi::errors::NotFound( "Output(Out) of LoDTensorToArrayOp should not be null.")); auto x_dim = context->GetInputDim("X"); diff --git a/paddle/fluid/operators/lookup_table_dequant_op.cc b/paddle/fluid/operators/lookup_table_dequant_op.cc index 93826aab0d573..6f780b946eae8 100644 --- a/paddle/fluid/operators/lookup_table_dequant_op.cc +++ b/paddle/fluid/operators/lookup_table_dequant_op.cc @@ -30,17 +30,17 @@ class LookupTableDequantOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->HasInput("W"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(W) of LookupTableDequantOp should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasInput("Ids"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(Ids) of LookupTableDequantOp should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(Out) of LookupTableDequantOp should not be null.")); auto table_dims = ctx->GetInputDim("W"); @@ -50,7 +50,7 @@ class LookupTableDequantOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( table_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: The dimensions of the 'lookup table' must be 2. 
" "But received lookup table's dimensions = %d, " "lookup table's shape = [%s].", @@ -59,7 +59,7 @@ class LookupTableDequantOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ids_dims[ids_rank - 1], 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: The last dimensions of the 'Ids' tensor must be 1. " "But received Ids's last dimensions = %d, Ids's shape = [%s].", ids_dims[ids_rank - 1], @@ -69,7 +69,7 @@ class LookupTableDequantOp : public framework::OperatorWithKernel { common::vectorize(common::slice_ddim(ids_dims, 0, ids_rank - 1)); PADDLE_ENFORCE_GE(table_dims[1], 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "the second dim of table_dims should be " "greater or equal to 2, but the actual shape " "is [%s]", diff --git a/paddle/fluid/operators/lookup_table_dequant_op.h b/paddle/fluid/operators/lookup_table_dequant_op.h index 2f5a3d0fd7a16..191f05597668c 100644 --- a/paddle/fluid/operators/lookup_table_dequant_op.h +++ b/paddle/fluid/operators/lookup_table_dequant_op.h @@ -65,7 +65,7 @@ class LookupTableDequantKernel : public framework::OpKernel { PADDLE_ENFORCE_GE( table_var->Type(), framework::VarTypeTrait::kId, - platform::errors::InvalidArgument("lookup table must be LodTensor")); + phi::errors::InvalidArgument("lookup table must be LodTensor")); auto *table_t = context.Input("W"); int64_t row_number = table_t->dims()[0]; int64_t quant_number = table_t->dims()[1]; @@ -81,7 +81,7 @@ class LookupTableDequantKernel : public framework::OpKernel { PADDLE_ENFORCE_LT( ids[i], row_number, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Variable value (input) of OP(fluid.layers.embedding) " "expected >= 0 and < %ld, but got %ld. Please check input " "value.", @@ -90,7 +90,7 @@ class LookupTableDequantKernel : public framework::OpKernel { PADDLE_ENFORCE_GE( ids[i], 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Variable value (input) of OP(fluid.layers.embedding) " "expected >= 0 and < %ld, but got %ld. Please check input " "value.", diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index a8185691c45aa..6818b363bc89a 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/var_type_inference.h" -#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/phi/common/bfloat16.h" namespace paddle { namespace operators { @@ -40,7 +40,7 @@ class LookupTableOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( table_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: The dimensions of the 'lookup table' must be 2. " "But received lookup table's dimensions = %d, " "lookup table's shape = [%s].", @@ -49,7 +49,7 @@ class LookupTableOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ids_dims[ids_rank - 1], 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: The last dimensions of the 'Ids' tensor must be 1. 
" "But received Ids's last dimensions = %d, Ids's shape = [%s].", ids_dims[ids_rank - 1], @@ -239,11 +239,11 @@ REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel, ops::LookupTableKernel, ops::LookupTableKernel, - ops::LookupTableKernel); + ops::LookupTableKernel); REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel, ops::LookupTableGradKernel, - ops::LookupTableGradKernel); + ops::LookupTableGradKernel); /* ========================== register checkpoint ===========================*/ diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index ba8af995429a3..46ae30754a933 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -15,8 +15,8 @@ limitations under the License. */ #include "paddle/fluid/operators/lookup_table_op.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/float16.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" +#include "paddle/phi/common/float16.h" namespace paddle { namespace operators { @@ -195,7 +195,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { common::flatten_to_2d(d_output_dims, d_output_dims.size() - 1); PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output_dims_2d, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: The shape of lookup_table@Grad and " "output@Grad should be same. " "But received lookup_table@Grad's shape = [%s], " @@ -252,10 +252,10 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(lookup_table, ops::LookupTableCUDAKernel, ops::LookupTableCUDAKernel, - ops::LookupTableCUDAKernel, + ops::LookupTableCUDAKernel, ops::LookupTableCUDAKernel, ops::LookupTableCUDAKernel); REGISTER_OP_CUDA_KERNEL(lookup_table_grad, ops::LookupTableGradCUDAKernel, ops::LookupTableGradCUDAKernel, - ops::LookupTableGradCUDAKernel); + ops::LookupTableGradCUDAKernel); diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index 21f0bf6a957ae..f4e48065742ca 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -64,7 +64,7 @@ class LookupTableKernel : public framework::OpKernel { PADDLE_ENFORCE_LT( ids[i], row_number, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Variable value (input) of OP(fluid.layers.embedding) " "expected >= 0 and < %ld, but got %ld. Please check input " "value.", @@ -73,7 +73,7 @@ class LookupTableKernel : public framework::OpKernel { PADDLE_ENFORCE_GE( ids[i], 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Variable value (input) of OP(fluid.layers.embedding) " "expected >= 0 and < %ld, but got %ld. Please check input " "value.", @@ -99,7 +99,7 @@ class LookupTableKernel : public framework::OpKernel { PADDLE_ENFORCE_GE( ids[i], 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Variable value (input) of OP(fluid.layers.embedding) " "expected >= 0. But received %ld", ids[i])); @@ -129,14 +129,14 @@ class LookupTableKernel : public framework::OpKernel { PADDLE_ENFORCE_GE( ids[i], 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Variable value (input) of OP(fluid.layers.embedding) " "expected >= 0. But received %ld", ids[i])); PADDLE_ENFORCE_GE( id_index, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "the input key should be exists. 
But received %d.", id_index)); @@ -173,7 +173,7 @@ class LookupTableGradKernel : public framework::OpKernel { auto *table_t = context.Input("W"); table_dim = table_t->value().dims(); } else { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "The parameter W of a LookupTable " "must be either phi::DenseTensor or SelectedRows")); } @@ -210,7 +210,7 @@ class LookupTableGradKernel : public framework::OpKernel { common::flatten_to_2d(d_output_dims, d_output_dims.size() - 1); PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output_dims_2d, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: The shape of lookup_table@Grad and " "output@Grad should be same. " "But received lookup_table@Grad's shape = [%s], " @@ -243,7 +243,7 @@ class LookupTableGradKernel : public framework::OpKernel { PADDLE_ENFORCE_LT( ids_data[i], N, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Variable value (input) of OP(fluid.layers.embedding) " "expected >= 0 and < %ld, but got %ld. Please check input " "value.", @@ -252,7 +252,7 @@ class LookupTableGradKernel : public framework::OpKernel { PADDLE_ENFORCE_GE( ids_data[i], 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Variable value (input) of OP(fluid.layers.embedding) " "expected >= 0 and < %ld, but got %ld. Please check input" "value.", diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index edd8b20da160c..137d6bea417c3 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -15,8 +15,8 @@ limitations under the License. */ #include "paddle/fluid/operators/lookup_table_v2_op.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/float16.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" +#include "paddle/phi/common/float16.h" namespace paddle { namespace operators { @@ -190,7 +190,7 @@ struct LookupTableV2GradCUDAFunctor { common::flatten_to_2d(d_output_dims, d_output_dims.size() - 1); PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output_dims_2d, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: The shape of lookup_table@Grad and " "output@Grad should be same. " "But received lookup_table@Grad's shape = [%s], " diff --git a/paddle/fluid/operators/lookup_table_v2_op.h b/paddle/fluid/operators/lookup_table_v2_op.h index 82dbac8b21dfc..cce29cb715563 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.h +++ b/paddle/fluid/operators/lookup_table_v2_op.h @@ -78,7 +78,7 @@ struct LookupTableV2CPUFunctor { PADDLE_ENFORCE_LT( ids[i], row_number, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Variable value (input) of OP(fluid.layers.embedding) " "expected >= 0 and < %ld, but got %ld. Please check input " "value.", @@ -87,7 +87,7 @@ struct LookupTableV2CPUFunctor { PADDLE_ENFORCE_GE( ids[i], 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Variable value (input) of OP(fluid.layers.embedding) " "expected >= 0 and < %ld, but got %ld. Please check input " "value.", @@ -113,7 +113,7 @@ struct LookupTableV2CPUFunctor { PADDLE_ENFORCE_GE( ids[i], 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Variable value (input) of OP(fluid.layers.embedding) " "expected >= 0. 
But received %ld", ids[i])); @@ -121,7 +121,7 @@ struct LookupTableV2CPUFunctor { PADDLE_ENFORCE_GE( id_index, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "the input key should be exists. But received %d.", id_index)); @@ -173,7 +173,7 @@ struct LookupTableV2GradCPUFunctor { auto *table_t = context_.Input("W"); table_dim = table_t->value().dims(); } else { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "The parameter W of a LookupTableV2 " "must be either phi::DenseTensor or SelectedRows")); } @@ -209,7 +209,7 @@ struct LookupTableV2GradCPUFunctor { common::flatten_to_2d(d_output_dims, d_output_dims.size() - 1); PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output_dims_2d, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: The shape of lookup_table@Grad and " "output@Grad should be same. " "But received lookup_table@Grad's shape = [%s], " @@ -242,7 +242,7 @@ struct LookupTableV2GradCPUFunctor { PADDLE_ENFORCE_LT( ids_data[i], N, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Variable value (input) of OP(fluid.layers.embedding) " "expected >= 0 and < %ld, but got %ld. Please check input " "value.", @@ -251,7 +251,7 @@ struct LookupTableV2GradCPUFunctor { PADDLE_ENFORCE_GE( ids_data[i], 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Variable value (input) of OP(fluid.layers.embedding) " "expected >= 0 and < %ld, but got %ld. Please check input " "value.", diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index bf4c72a2133b6..705af5f8d0587 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -21,7 +21,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #ifdef PADDLE_WITH_DNNL -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/onednn_helper.h" #endif namespace paddle { @@ -199,23 +199,23 @@ class LRNOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( x_dim.size(), 4, - platform::errors::InvalidArgument("Input(input) rank should be 4, " - "but received input rank (%d) != 4", - x_dim.size())); + phi::errors::InvalidArgument("Input(input) rank should be 4, " + "but received input rank (%d) != 4", + x_dim.size())); int n = ctx->Attrs().Get("n"); - PADDLE_ENFORCE_GT(n, - 0UL, - platform::errors::InvalidArgument( - "Argument(n) should be positive, " - "but received n(%d) not greater than 0", - n)); - PADDLE_ENFORCE_EQ(n % 2, - 1UL, - platform::errors::InvalidArgument( - "Argument(n) should be odd value, " - "but received n(%d) is not an odd value", - n)); + PADDLE_ENFORCE_GT( + n, + 0UL, + phi::errors::InvalidArgument("Argument(n) should be positive, " + "but received n(%d) not greater than 0", + n)); + PADDLE_ENFORCE_EQ( + n % 2, + 1UL, + phi::errors::InvalidArgument("Argument(n) should be odd value, " + "but received n(%d) is not an odd value", + n)); ctx->SetOutputDim("Out", x_dim); ctx->ShareLoD("X", /*->*/ "Out"); diff --git a/paddle/fluid/operators/lrn_op.h b/paddle/fluid/operators/lrn_op.h index 4d1cc268d48b6..063ec6e445044 100644 --- a/paddle/fluid/operators/lrn_op.h +++ b/paddle/fluid/operators/lrn_op.h @@ -78,21 +78,21 @@ class LRNKernel : public framework::OpKernel { PADDLE_ENFORCE_GE( alpha, 0UL, - platform::errors::InvalidArgument("Argument(alpha) should >= 0.0, " - "but received alpha(%d) less than 0", - alpha)); + phi::errors::InvalidArgument("Argument(alpha) should >= 0.0, " + "but received alpha(%d) less than 0", + alpha)); PADDLE_ENFORCE_GE( beta, 0UL, - platform::errors::InvalidArgument("Argument(beta) should >= 0.0, " - "but received beta(%d) less than 0", - beta)); + phi::errors::InvalidArgument("Argument(beta) should >= 0.0, " + "but received beta(%d) less than 0", + beta)); PADDLE_ENFORCE_GE( k, 0UL, - platform::errors::InvalidArgument("Argument(k) should >= 0.0, " - "but received k(%d) less than 0", - k)); + phi::errors::InvalidArgument("Argument(k) should >= 0.0, " + "but received k(%d) less than 0", + k)); LRNFunctor f; f(ctx, x, out, mid, N, C, H, W, n, k, alpha, beta, data_layout); @@ -165,7 +165,7 @@ class LRNGradKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( !ctx.Attr("is_test"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "is_test attribute should be set to False in training phase. 
" "but received is_test == True in training phase.")); diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc index 8bdb455375bee..a34fc82fe177c 100644 --- a/paddle/fluid/operators/lstm_op.cc +++ b/paddle/fluid/operators/lstm_op.cc @@ -46,20 +46,20 @@ class LSTMOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( in_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(X)'s rank must be 2, but received %d.", in_dims.size())); if (ctx->HasInput("H0")) { PADDLE_ENFORCE_EQ( ctx->HasInput("C0"), true, - platform::errors::NotFound("Input(Cell) and Input(Hidden) of LSTM " - "should not be null at the same time.")); + phi::errors::NotFound("Input(Cell) and Input(Hidden) of LSTM " + "should not be null at the same time.")); auto h_dims = ctx->GetInputDim("H0"); auto c_dims = ctx->GetInputDim("C0"); PADDLE_ENFORCE_EQ(h_dims, c_dims, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of Input(H0) and Input(C0) should " "be the same, but received [%s] (H0) vs [%s] (C0).", h_dims, @@ -71,19 +71,19 @@ class LSTMOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( w_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rank of Input(Weight) should be 2, but received %d.", w_dims.size())); PADDLE_ENFORCE_EQ(w_dims[0], frame_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dimension of Input(Weight) should be %d, " "but received %d.", frame_size, w_dims[0])); PADDLE_ENFORCE_EQ(w_dims[1], 4 * frame_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dimension of Input(Weight) should be 4 * " "%d, but received %d.", frame_size, @@ -93,13 +93,13 @@ class LSTMOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( b_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rank of Input(Bias) should be 2, but received %d.", b_dims.size())); PADDLE_ENFORCE_EQ( b_dims[0], 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dimension of Input(Bias) should be 1, but received %d.", b_dims[0])); @@ -107,7 +107,7 @@ class LSTMOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( b_dims[1], 7 * frame_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dimension of Input(Bias) should be 7 * %d if enable " "peepholes connection, but received %d.", frame_size, @@ -116,7 +116,7 @@ class LSTMOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( b_dims[1], 4 * frame_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dimension of Input(Bias) should be 4 * %d if disable " "peepholes connection, but received %d.", frame_size, diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h index 0e068c47647e3..278fdbdb41761 100644 --- a/paddle/fluid/operators/lstm_op.h +++ b/paddle/fluid/operators/lstm_op.h @@ -252,7 +252,7 @@ class LSTMGradKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( frame_size, out_dims[1], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dimension of Input(" + framework::GradVarName("Hidden") + ") should be %d, but received %d in LSTM@Grad operator.", diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index d2c026a9042c7..77cd6433c69e9 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ 
b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -98,7 +98,7 @@ void GetClassInterval(const gpuStream_t& stream, if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. " @@ -109,7 +109,7 @@ void GetClassInterval(const gpuStream_t& stream, comm_context_manager.Get(std::to_string(rid))); PADDLE_ENFORCE_NE(comm_ctx, nullptr, - paddle::platform::errors::Unavailable( + phi::errors::Unavailable( "NCCLCommContext is nullptr, collective op should " "has ring_id attr.")); } else { @@ -287,7 +287,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, PADDLE_ENFORCE_EQ( comm_context_manager.Has(std::to_string(ring_id)), true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. " @@ -299,7 +299,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, PADDLE_ENFORCE_NE( comm_ctx, nullptr, - paddle::platform::errors::Unavailable( + phi::errors::Unavailable( "NCCLCommContext is nullptr, collective op should " "has ring_id attr.")); } else { diff --git a/paddle/fluid/operators/match_matrix_tensor_op.cc b/paddle/fluid/operators/match_matrix_tensor_op.cc index 746a28ed588d5..e790262a0fd78 100644 --- a/paddle/fluid/operators/match_matrix_tensor_op.cc +++ b/paddle/fluid/operators/match_matrix_tensor_op.cc @@ -36,7 +36,7 @@ void MatchMatrixTensorOP::InferShape(framework::InferShapeContext* ctx) const { auto x_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(x_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimensions of Input(X) should be equal to 2, " "but received %d.", x_dims.size())); @@ -44,7 +44,7 @@ void MatchMatrixTensorOP::InferShape(framework::InferShapeContext* ctx) const { auto y_dims = ctx->GetInputDim("Y"); PADDLE_ENFORCE_EQ(y_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimensions of Input(Y) should be equal to 2, " "but received %d.", y_dims.size())); @@ -52,7 +52,7 @@ void MatchMatrixTensorOP::InferShape(framework::InferShapeContext* ctx) const { auto w_dims = ctx->GetInputDim("W"); PADDLE_ENFORCE_EQ(w_dims.size(), 3, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimensions of Input(W) should be equal to 3, " "but received %d.", w_dims.size())); @@ -61,7 +61,7 @@ void MatchMatrixTensorOP::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ( w_dims[0], x_dims[1], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dimension of Input(W) should be equal to the second " "dimension of Input(X). 
But received the first dimension of Input(W) " "is %d, the second dimension of Input(X) is %d.", @@ -70,7 +70,7 @@ void MatchMatrixTensorOP::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ( w_dims[1], dim_t, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dimension of Input(W) should be equal to 'dim_t', but " "received the second dimension of Input(W) is %d, 'dim_t' is %d.", w_dims[1], @@ -78,7 +78,7 @@ void MatchMatrixTensorOP::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ( w_dims[2], y_dims[1], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The last dimension of Input(W) should be equal to " "the second dimension of Input(Y). But received the last dimension " "of Input(W) is %d, the second dimension of Input(Y) is %d.", @@ -93,19 +93,19 @@ void MatchMatrixTensorOP::InferShape(framework::InferShapeContext* ctx) const { const auto& x_lod = x_var->Get().lod(); PADDLE_ENFORCE_EQ(x_lod.empty(), false, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(X) should hold LoD information, but " "received Input(X).lod() is empty.")); const auto& x_lod_0 = x_lod[0]; PADDLE_ENFORCE_GE(x_lod_0.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimensions of Input(X)'s LoD data should be " "equal to 2, but received %d.", x_lod_0.size())); PADDLE_ENFORCE_EQ(x_dims[0], static_cast(x_lod_0.back()), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The last element of Input(X)'s LoD data should be " "equal to the first dimension of Input(X). " "But received the last element of Input(X)'s LoD " @@ -118,19 +118,19 @@ void MatchMatrixTensorOP::InferShape(framework::InferShapeContext* ctx) const { const auto& y_lod = y_var->Get().lod(); PADDLE_ENFORCE_EQ(y_lod.empty(), false, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(Y) should hold LoD information, but " "received Input(Y).lod() is empty.")); const auto& y_lod_0 = y_lod[0]; PADDLE_ENFORCE_GE(y_lod_0.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimensions of Input(Y)'s LoD data should be " "equal to 2, but received %d.", y_lod_0.size())); PADDLE_ENFORCE_EQ(y_dims[0], static_cast(y_lod_0.back()), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The last element of Input(Y)'s LoD data should be " "equal to the first dimension of Input(Y). " "But received the last element of Input(Y)'s LoD " @@ -140,7 +140,7 @@ void MatchMatrixTensorOP::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ(x_lod_0.size(), y_lod_0.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimensions of Input(X)'s and Input(Y)'s LoD " "data should be equal. 
" "But received the dimensions of Input(X)'s LoD is " @@ -164,17 +164,17 @@ void MatchMatrixTensorOP::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_GE( x_desc->GetLoDLevel(), 1, - platform::errors::InvalidArgument("The LoD level of Input(X) should be " - "greater than 1, but received %d.", - x_desc->GetLoDLevel())); + phi::errors::InvalidArgument("The LoD level of Input(X) should be " + "greater than 1, but received %d.", + x_desc->GetLoDLevel())); framework::VarDesc* y_desc = PADDLE_GET(framework::VarDesc*, ctx->GetInputVarPtrs("Y")[0]); PADDLE_ENFORCE_GE( y_desc->GetLoDLevel(), 1, - platform::errors::InvalidArgument("The LoD level of Input(Y) should be " - "greater than 1, but received %d.", - y_desc->GetLoDLevel())); + phi::errors::InvalidArgument("The LoD level of Input(Y) should be " + "greater than 1, but received %d.", + y_desc->GetLoDLevel())); ctx->ShareLoD("X", "Out"); } @@ -255,20 +255,20 @@ class CPUMatchMatrixTensorOPKernel : public framework::OpKernel { const auto& x_lod = x->lod(); PADDLE_ENFORCE_EQ(x_lod.empty(), false, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(X) should hold LoD information, but " "received Input(X).lod() is empty.")); const auto& x_lod_0 = x_lod[0]; PADDLE_ENFORCE_GE(x_lod_0.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimensions of Input(X)'s LoD data should be " "equal to 2, but received %d.", x_lod_0.size())); auto x_dims = x->dims(); PADDLE_ENFORCE_EQ(x_dims[0], static_cast(x_lod_0.back()), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The last element of Input(X)'s LoD data should be " "equal to the first dimension of Input(X). " "But received the last element of Input(X)'s LoD " @@ -278,20 +278,20 @@ class CPUMatchMatrixTensorOPKernel : public framework::OpKernel { const auto& y_lod = y->lod(); PADDLE_ENFORCE_EQ(y_lod.empty(), false, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(Y) should hold LoD information, but " "received Input(Y).lod() is empty.")); const auto& y_lod_0 = y_lod[0]; PADDLE_ENFORCE_GE(y_lod_0.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimensions of Input(Y)'s LoD data should be " "equal to 2, but received %d.", y_lod_0.size())); auto y_dims = y->dims(); PADDLE_ENFORCE_EQ(y_dims[0], static_cast(y_lod_0.back()), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The last element of Input(Y)'s LoD data should be " "equal to the first dimension of Input(Y). " "But received the last element of Input(Y)'s LoD " @@ -301,7 +301,7 @@ class CPUMatchMatrixTensorOPKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(x_lod_0.size(), y_lod_0.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimensions of Input(X)'s and Input(Y)'s LoD " "data should be equal. 
" "But received the dimensions of Input(X)'s LoD is " diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc index aeff6c394c429..974d5d5d5a3c8 100644 --- a/paddle/fluid/operators/math/beam_search.cc +++ b/paddle/fluid/operators/math/beam_search.cc @@ -100,10 +100,10 @@ class BeamSearchFunctor { lod[0].assign(high_level.begin(), high_level.end()); lod[1].assign(low_level.begin(), low_level.end()); if (!framework::CheckLoD(lod)) { - PADDLE_THROW(platform::errors::InvalidArgument( - "lod %s is not right in" - " beam_search, please check your code.", - framework::LoDToString(lod))); + PADDLE_THROW( + phi::errors::InvalidArgument("lod %s is not right in" + " beam_search, please check your code.", + framework::LoDToString(lod))); } selected_ids->set_lod(lod); selected_scores->set_lod(lod); diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu index 098f40ab526b1..702c34ce2161f 100644 --- a/paddle/fluid/operators/math/beam_search.cu +++ b/paddle/fluid/operators/math/beam_search.cu @@ -504,17 +504,17 @@ class BeamSearchFunctor { num_used_threads)); } } else { - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(phi::errors::Unimplemented( "Not implemented other number of sequences yet.")); } context.Wait(); mix_vector.CopyToCPU(); if (!framework::CheckLoD(selected_lod)) { - PADDLE_THROW(platform::errors::InvalidArgument( - "lod %s is not right in" - " beam_search, please check your code.", - framework::LoDToString(selected_lod))); + PADDLE_THROW( + phi::errors::InvalidArgument("lod %s is not right in" + " beam_search, please check your code.", + framework::LoDToString(selected_lod))); } selected_ids->set_lod(selected_lod); diff --git a/paddle/fluid/operators/math/beam_search_xpu.cc b/paddle/fluid/operators/math/beam_search_xpu.cc index 4ac0e3d886017..33484d139982c 100644 --- a/paddle/fluid/operators/math/beam_search_xpu.cc +++ b/paddle/fluid/operators/math/beam_search_xpu.cc @@ -41,7 +41,7 @@ void CopyDataByCondition(const T *x, T **y, int len, const Place &place) { PADDLE_ENFORCE_EQ( r, xpu::Error_t::SUCCESS, - platform::errors::External("Copy data form xpu to cpu failed")); + phi::errors::External("Copy data form xpu to cpu failed")); } } @@ -125,10 +125,10 @@ class BeamSearchFunctor { lod[0].assign(high_level.begin(), high_level.end()); lod[1].assign(low_level.begin(), low_level.end()); if (!framework::CheckLoD(lod)) { - PADDLE_THROW(platform::errors::InvalidArgument( - "lod %s is not right in" - " beam_search, please check your code.", - framework::LoDToString(lod))); + PADDLE_THROW( + phi::errors::InvalidArgument("lod %s is not right in" + " beam_search, please check your code.", + framework::LoDToString(lod))); } selected_ids->set_lod(lod); selected_scores->set_lod(lod); diff --git a/paddle/fluid/operators/math/bert_encoder_functor.h b/paddle/fluid/operators/math/bert_encoder_functor.h index 76e27380b90e2..32b36b9c1515e 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.h +++ b/paddle/fluid/operators/math/bert_encoder_functor.h @@ -28,7 +28,7 @@ namespace cub = hipcub; #endif #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/phi/common/float16.h" namespace paddle { namespace operators { @@ -39,7 +39,7 @@ struct CUDATypeTraits; template <> struct CUDATypeTraits { - typedef platform::float16 TYPE; + typedef phi::dtype::float16 TYPE; }; template <> diff --git a/paddle/fluid/operators/math/concat_and_split.cc 
b/paddle/fluid/operators/math/concat_and_split.cc index 87b3695553356..7a37d929be71d 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -110,7 +110,7 @@ class ConcatFunctor { PADDLE_ENFORCE_EQ( r, XPU_SUCCESS, - platform::errors::External( + phi::errors::External( "XPU API return wrong value[%d %s], please check whether " "Baidu Kunlun Card is properly installed.", r, @@ -169,7 +169,7 @@ class SplitFunctor { PADDLE_ENFORCE_EQ( r, XPU_SUCCESS, - platform::errors::External( + phi::errors::External( "XPU API return wrong value[%d %s], please check whether " "Baidu Kunlun Card is properly installed.", r, @@ -190,8 +190,8 @@ FOR_ALL_TYPES(DEFINE_FUNCTOR); template class SplitFunctor; DEFINE_XPU_FUNCTOR(float) -DEFINE_XPU_FUNCTOR(platform::float16) -DEFINE_XPU_FUNCTOR(platform::bfloat16) +DEFINE_XPU_FUNCTOR(phi::dtype::float16) +DEFINE_XPU_FUNCTOR(phi::dtype::bfloat16) #endif } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/context_project.h b/paddle/fluid/operators/math/context_project.h index 20211160b7e5e..f510034a7ea0c 100644 --- a/paddle/fluid/operators/math/context_project.h +++ b/paddle/fluid/operators/math/context_project.h @@ -144,7 +144,7 @@ class ContextProjectFunctor { if (padding_trainable) { PADDLE_ENFORCE_NOT_NULL( padding_data, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input tensor 'padding_data' should not be NULL.")); for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { if (lod_level_0[i] == lod_level_0[i + 1]) continue; diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h index 8d6b0b99f9d52..c4a22ece92b54 100644 --- a/paddle/fluid/operators/math/eigen_values_vectors.h +++ b/paddle/fluid/operators/math/eigen_values_vectors.h @@ -38,7 +38,7 @@ static void CheckEighResult(const int batch, const int info) { PADDLE_ENFORCE_LE( info, 0, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "For batch [%d]: the [%d] off-diagonal elements of an intermediate" "tridiagonal form did not converge to zero", batch, @@ -46,7 +46,7 @@ static void CheckEighResult(const int batch, const int info) { PADDLE_ENFORCE_GE( info, 0, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "For batch [%d]: the [%d] argument had an illegal value", batch, info)); @@ -160,7 +160,7 @@ struct MatrixEighFunctor { } if (has_vectors) { PADDLE_ENFORCE_NOT_NULL(eigen_vectors, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "When has_vectors is true," "the eigenvectors needs to be calculated, " "so the eigenvectors must be provided.")); @@ -293,7 +293,7 @@ struct MatrixEighFunctor { } if (has_vectors) { PADDLE_ENFORCE_NOT_NULL(eigen_vectors, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "When has_vectors is true," "the eigenvectors needs to be calculated," "so the eigenvectors must be provided.")); diff --git a/paddle/fluid/operators/math/prelu.cu b/paddle/fluid/operators/math/prelu.cu index 9dc25e30ce9aa..eadfdf8cf39e4 100644 --- a/paddle/fluid/operators/math/prelu.cu +++ b/paddle/fluid/operators/math/prelu.cu @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
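prelu.cu below swaps the fluid float16/bfloat16 headers for their phi counterparts, the same substitution applied in every file of this diff that touches half-precision types. A hedged sketch of the new spellings (the Scale helper is hypothetical, not from this diff):

#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"

// phi::dtype::float16 / phi::dtype::bfloat16 replace the old
// platform::float16 / platform::bfloat16 spellings; arithmetic on these
// types promotes through float.
template <typename T>
T Scale(T x) {
  return static_cast<T>(x * static_cast<T>(2));
}

template phi::dtype::float16 Scale(phi::dtype::float16);
template phi::dtype::bfloat16 Scale(phi::dtype::bfloat16);
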
*/ #include "paddle/fluid/operators/math/prelu.h" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" namespace paddle { namespace operators { @@ -135,18 +135,18 @@ void PreluScalarDirectCUDAFunctor::operator()(gpuStream_t stream, } template class PreluChannelWiseDirectCUDAFunctor; -template class PreluChannelWiseDirectCUDAFunctor; -template class PreluChannelWiseDirectCUDAFunctor; +template class PreluChannelWiseDirectCUDAFunctor; +template class PreluChannelWiseDirectCUDAFunctor; template class PreluChannelWiseDirectCUDAFunctor; template class PreluElementWiseDirectCUDAFunctor; -template class PreluElementWiseDirectCUDAFunctor; -template class PreluElementWiseDirectCUDAFunctor; +template class PreluElementWiseDirectCUDAFunctor; +template class PreluElementWiseDirectCUDAFunctor; template class PreluElementWiseDirectCUDAFunctor; template class PreluScalarDirectCUDAFunctor; -template class PreluScalarDirectCUDAFunctor; -template class PreluScalarDirectCUDAFunctor; +template class PreluScalarDirectCUDAFunctor; +template class PreluScalarDirectCUDAFunctor; template class PreluScalarDirectCUDAFunctor; } // namespace math diff --git a/paddle/fluid/operators/math/sampler.h b/paddle/fluid/operators/math/sampler.h index 9bca69edd1fea..e14e1ca572cab 100644 --- a/paddle/fluid/operators/math/sampler.h +++ b/paddle/fluid/operators/math/sampler.h @@ -36,7 +36,7 @@ class Sampler { PADDLE_ENFORCE_GT( range, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Range should be greater than 0, but received %d.", range)); if (seed == 0) { std::random_device r; diff --git a/paddle/fluid/operators/math/tree2col.cc b/paddle/fluid/operators/math/tree2col.cc index 41c131de0f392..f9950cd95de0b 100644 --- a/paddle/fluid/operators/math/tree2col.cc +++ b/paddle/fluid/operators/math/tree2col.cc @@ -58,7 +58,7 @@ void Tree2ColUtil::construct_tree(const phi::DenseTensor &EdgeSet, const auto &edge_set_dims = EdgeSet.dims(); PADDLE_ENFORCE_EQ(edge_set_dims[1], 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dimension of the EdgeSet shall be 2, but " "got %ld != 2. Please check the input value.", edge_set_dims[1])); diff --git a/paddle/fluid/operators/math/unpooling.cc b/paddle/fluid/operators/math/unpooling.cc index 78c41f1b8387a..a4e64b4d84fc2 100644 --- a/paddle/fluid/operators/math/unpooling.cc +++ b/paddle/fluid/operators/math/unpooling.cc @@ -43,7 +43,7 @@ class Unpool2dMaxFunctor { PADDLE_ENFORCE_LT( index, output_feasize, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "index should less than output tensor height * output tensor " "width. Expected %ld < %ld, but got " "%ld >= %ld. Please check input value.", @@ -88,7 +88,7 @@ class Unpool2dMaxGradFunctor { PADDLE_ENFORCE_LT( index, output_feasize, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "index should less than output tensor height * output tensor " "width. Expected %ld < %ld, but got " "%ld >= %ld. Please check input value.", @@ -134,7 +134,7 @@ class Unpool3dMaxFunctor { PADDLE_ENFORCE_LT( index, output_feasize, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "index should less than output tensor depth * output tensor " "height " "* output tensor width. 
Expected %ld < %ld, but got " @@ -182,7 +182,7 @@ class Unpool3dMaxGradFunctor { PADDLE_ENFORCE_LT( index, output_feasize, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "index should less than output tensor depth * output tensor " "height " "* output tensor width. Expected %ld < %ld, but got " diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 895a427bae6e2..c55a1e6b14123 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -961,7 +961,7 @@ REGISTER_OP_CUDA_KERNEL( matmul, ops::MatMulKernel, ops::MatMulKernel, - ops::MatMulKernel); + ops::MatMulKernel); #endif #if defined(PADDLE_WITH_CUDA) @@ -971,13 +971,13 @@ REGISTER_OP_CUDA_KERNEL( ops::MatMulKernel, ops::MatMulKernel, ops::MatMulKernel, - ops::MatMulKernel); + ops::MatMulKernel); #else REGISTER_OP_CUDA_KERNEL( matmul, ops::MatMulKernel, ops::MatMulKernel, - ops::MatMulKernel); + ops::MatMulKernel); #endif #endif @@ -985,7 +985,7 @@ REGISTER_OP_CUDA_KERNEL( matmul_grad, ops::MatMulGradKernel, ops::MatMulGradKernel, - ops::MatMulGradKernel); + ops::MatMulGradKernel); REGISTER_OP_CUDA_KERNEL(matmul_grad_grad, ops::MatMulDoubleGradKernel, ops::MatMulDoubleGradKernel); diff --git a/paddle/fluid/operators/matmul_op_xpu.cc b/paddle/fluid/operators/matmul_op_xpu.cc index de2aa41d971df..095a90737f9ad 100644 --- a/paddle/fluid/operators/matmul_op_xpu.cc +++ b/paddle/fluid/operators/matmul_op_xpu.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/xpu_api_wrapper.h" +#include "paddle/phi/kernels/xpu/xpu_api_wrapper.h" namespace paddle { namespace operators { @@ -156,12 +156,13 @@ REGISTER_OP_XPU_KERNEL( matmul, ops::MatMulXPUKernel, ops::MatMulXPUKernel, - ops::MatMulXPUKernel); + ops::MatMulXPUKernel); REGISTER_OP_XPU_KERNEL( matmul_grad, ops::MatMulGradXPUKernel, ops::MatMulGradXPUKernel, ops::MatMulGradXPUKernel); + phi::dtype::float16>); #endif diff --git a/paddle/fluid/operators/memcpy_d2h_op.cc b/paddle/fluid/operators/memcpy_d2h_op.cc index 7233e437e147a..9c1087d42b2b3 100644 --- a/paddle/fluid/operators/memcpy_d2h_op.cc +++ b/paddle/fluid/operators/memcpy_d2h_op.cc @@ -67,10 +67,10 @@ class MemcpyD2HKernel { if (x == nullptr) { return; } - PADDLE_ENFORCE_EQ(ctx.HasOutput("Out"), - true, - platform::errors::NotFound( - "Output(Out) of memcpy_d2h_op is not found.")); + PADDLE_ENFORCE_EQ( + ctx.HasOutput("Out"), + true, + phi::errors::NotFound("Output(Out) of memcpy_d2h_op is not found.")); auto *out = ctx.OutputVar("Out"); // Get dev_ctx from ExecutionContext, it's D2H stream auto &dev_ctx = ctx.device_context(); @@ -136,13 +136,13 @@ REGISTER_OP_IPU_KERNEL_FUNCTOR(memcpy_d2h, ops::MemcpyD2HKernel, bool, ops::MemcpyD2HKernel, - paddle::platform::bfloat16, + phi::dtype::bfloat16, ops::MemcpyD2HKernel, paddle::platform::complex, ops::MemcpyD2HKernel, paddle::platform::complex, ops::MemcpyD2HKernel, - plat::float16, + phi::dtype::float16, ops::MemcpyD2HKernel, int16_t, ops::MemcpyD2HKernel); diff --git a/paddle/fluid/operators/memcpy_d2h_op.h b/paddle/fluid/operators/memcpy_d2h_op.h index 4f948e4482f8a..2a69ae556adfd 100644 --- a/paddle/fluid/operators/memcpy_d2h_op.h +++ b/paddle/fluid/operators/memcpy_d2h_op.h @@ -53,7 +53,7 @@ class MemcpyD2HFunctor { void operator()(const phi::SelectedRows &rows) const { // (JZ-LIANG) to support SelectedRows - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(phi::errors::Unimplemented( "Memcpy for 
SelectedRows is NOT support yet.")); } @@ -62,7 +62,7 @@ class MemcpyD2HFunctor { PADDLE_ENFORCE_EQ( true, false, - platform::errors::PermissionDenied( + phi::errors::PermissionDenied( "Not support type for Memcpy op with type %s", typeid(T).name())); } @@ -76,7 +76,7 @@ class MemcpyD2HFunctor { } else if (dst_place_type_ == 0) { framework::TensorCopy(src, platform::CPUPlace(), dev_ctx_, &dst); } else { - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(phi::errors::Unimplemented( "memcpy dst_place_type: %d is not supported yet.", dst_place_type_)); } // NOTE(Aurelius84): host <-> device memory copies of a memory block of 64 diff --git a/paddle/fluid/operators/memcpy_h2d_op.cc b/paddle/fluid/operators/memcpy_h2d_op.cc index 457b629268659..85cd21831c9b1 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.cc +++ b/paddle/fluid/operators/memcpy_h2d_op.cc @@ -68,10 +68,10 @@ class MemcpyH2DKernel { if (x == nullptr) { return; } - PADDLE_ENFORCE_EQ(ctx.HasOutput("Out"), - true, - platform::errors::NotFound( - "Output(Out) of memcpy_d2h_op is not found.")); + PADDLE_ENFORCE_EQ( + ctx.HasOutput("Out"), + true, + phi::errors::NotFound("Output(Out) of memcpy_d2h_op is not found.")); auto *out = ctx.OutputVar("Out"); // Get dev_ctx from ExecutionContext, it's H2D stream auto &dev_ctx = ctx.device_context(); @@ -137,13 +137,13 @@ REGISTER_OP_IPU_KERNEL_FUNCTOR(memcpy_h2d, ops::MemcpyH2DKernel, bool, ops::MemcpyH2DKernel, - paddle::platform::bfloat16, + phi::dtype::bfloat16, ops::MemcpyH2DKernel, paddle::platform::complex, ops::MemcpyH2DKernel, paddle::platform::complex, ops::MemcpyH2DKernel, - plat::float16, + phi::dtype::float16, ops::MemcpyH2DKernel, int16_t, ops::MemcpyH2DKernel); diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h index 5f480461d77cd..6b83ab1541976 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.h +++ b/paddle/fluid/operators/memcpy_h2d_op.h @@ -53,7 +53,7 @@ class MemcpyH2DFunctor { framework::TensorCopy( lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, &out_tensor); } else { - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(phi::errors::Unimplemented( "memcpy dst_place_type: %d is not supported yet.", dst_place_type_)); } out_tensor.set_lod(lod_tensor.lod()); @@ -61,7 +61,7 @@ class MemcpyH2DFunctor { void operator()(const phi::SelectedRows &rows) const { // (JZ-LIANG) to support SelectedRows - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(phi::errors::Unimplemented( "Memcpy for SelectedRows is NOT support yet.")); } @@ -70,7 +70,7 @@ class MemcpyH2DFunctor { PADDLE_ENFORCE_EQ( true, false, - platform::errors::PermissionDenied( + phi::errors::PermissionDenied( "Not support type for Memcpy op with type %s", typeid(T).name())); } diff --git a/paddle/fluid/operators/memcpy_op.cc b/paddle/fluid/operators/memcpy_op.cc index bb3e29df16d53..8031e318f3af9 100644 --- a/paddle/fluid/operators/memcpy_op.cc +++ b/paddle/fluid/operators/memcpy_op.cc @@ -87,7 +87,7 @@ class MemcpyKernel { PADDLE_ENFORCE_EQ( ctx.HasOutput("Out"), true, - platform::errors::NotFound("Output(Out) of memcpy_op is not found.")); + phi::errors::NotFound("Output(Out) of memcpy_op is not found.")); auto *out = ctx.OutputVar("Out"); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(ctx.GetPlace()); diff --git a/paddle/fluid/operators/memcpy_op.h b/paddle/fluid/operators/memcpy_op.h index bfdd43eaaa519..81432dcb30f6b 100644 --- a/paddle/fluid/operators/memcpy_op.h +++ 
b/paddle/fluid/operators/memcpy_op.h @@ -66,7 +66,7 @@ class MemcpyFunctor { lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, &out_tensor); #endif } else { - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(phi::errors::Unimplemented( "memcpy dst_place_type: %d is not supported yet.", dst_place_type_)); } out_tensor.set_lod(lod_tensor.lod()); @@ -74,7 +74,7 @@ class MemcpyFunctor { void operator()(const phi::SelectedRows &rows) const { // (JZ-LIANG) to support SelectedRows - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(phi::errors::Unimplemented( "Memcpy for SelectedRows is NOT support yet.")); } @@ -83,7 +83,7 @@ class MemcpyFunctor { PADDLE_ENFORCE_EQ( true, false, - platform::errors::PermissionDenied( + phi::errors::PermissionDenied( "Not support type for Memcpy op with type %s", typeid(T).name())); } diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc deleted file mode 100644 index 3ed27460e16b6..0000000000000 --- a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ /dev/null @@ -1,276 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/lod_utils.h" - -namespace phi { -class DenseTensor; -} // namespace phi - -namespace paddle { -namespace framework { -class InferShapeContext; -class OpDesc; -class Scope; -} // namespace framework -namespace imperative { -class OpBase; -} // namespace imperative -} // namespace paddle - -namespace paddle { -namespace operators { - -using LoD = framework::LoD; - -class MergeLoDTensorOp : public framework::OperatorBase { - public: - MergeLoDTensorOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - protected: - void RunBase(const framework::Scope &scope, - const platform::Place &dev_place) const { - // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(dev_place); - - auto &x = scope.FindVar(Input("X"))->Get(); - auto &mask = scope.FindVar(Input("Mask"))->Get(); - auto &in_true = scope.FindVar(Input("InTrue"))->Get(); - auto &in_false = scope.FindVar(Input("InFalse"))->Get(); - auto *out = scope.FindVar(Output("Out"))->GetMutable(); - auto level = static_cast(Attr("level")); - - PADDLE_ENFORCE_EQ( - in_true.numel() || in_false.numel(), - true, - platform::errors::InvalidArgument( - "Input(InTrue) or Input(InFalse) should be initialized.")); - - auto &mask_dim = mask.dims(); - std::unique_ptr cpu_mask{new phi::DenseTensor()}; - if (platform::is_cpu_place(mask.place())) { - cpu_mask->ShareDataWith(mask); - } else if (platform::is_gpu_place(mask.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - framework::TensorCopy( - mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); -#else - 
PADDLE_THROW(platform::errors::PreconditionNotMet( - "Not supported GPU, Please recompile or reinstall paddle with CUDA " - "support.")); -#endif - } - auto *mask_data = cpu_mask->data(); - - platform::Place place = dev_place; - int64_t batch_size = in_true.dims()[0] + in_false.dims()[0]; - auto data_type = in_true.IsInitialized() ? in_true.type() : in_false.type(); - int rank = 0; - framework::DDim in_dims; - if (in_true.IsInitialized()) { - rank = in_true.dims().size(); - in_dims = common::slice_ddim(in_true.dims(), 1, rank); - } else { - rank = in_false.dims().size(); - in_dims = common::slice_ddim(in_false.dims(), 1, rank); - } - - auto in_dim_vec = common::vectorize(in_dims); - in_dim_vec.insert(in_dim_vec.begin(), batch_size); - - framework::DDim out_dims = common::make_ddim(in_dim_vec); - out->Resize(out_dims); - - out->mutable_data(place, data_type); - - auto *out_lod = out->mutable_lod(); - out_lod->clear(); - size_t out_offset = 0; - - // Build phi::DenseTensor `out` - - size_t in_true_idx = 0; - size_t in_false_idx = 0; - for (size_t i = 0; i < static_cast(mask_dim[0]); i++) { - const phi::DenseTensor *input = nullptr; - size_t *in_idx = nullptr; - if (static_cast(mask_data[i]) == 0) { - input = &in_false; - in_idx = &in_false_idx; - } else { - input = &in_true; - in_idx = &in_true_idx; - } - auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset( - input->lod(), *in_idx, (*in_idx) + 1, 0); - auto &lod_length = lod_and_offset.first; - - phi::AppendLoD(out_lod, lod_length); - - size_t start_offset = lod_and_offset.second.first; - size_t end_offset = lod_and_offset.second.second; - - PADDLE_ENFORCE_GE(end_offset, - start_offset, - platform::errors::InvalidArgument( - "The end offset less than start offset, end offset " - "is %d, start offset is %d.", - end_offset, - start_offset)); - size_t len = end_offset - start_offset; - if (len == 0) { - continue; - } - auto slice = out->Slice(out_offset, out_offset + len); // NOLINT - framework::TensorCopy(input->Slice(start_offset, end_offset), // NOLINT - place, - dev_ctx, - &slice); - out_offset += len; - (*in_idx) += 1; - } - - for (size_t i = 0; i < level; i++) { - out_lod->insert(out_lod->begin(), x.lod()[i]); - } - } - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - RunBase(scope, dev_place); - } -}; - -class MergeLoDTensorInferOp : public MergeLoDTensorOp { - public: - MergeLoDTensorInferOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : MergeLoDTensorOp(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - RunBase(scope, dev_place); - framework::Variable *in_true_var = scope.FindVar(Input("InTrue")); - framework::Variable *in_false_var = scope.FindVar(Input("InFalse")); - in_true_var->Clear(); - in_false_var->Clear(); - in_true_var->GetMutable(); - in_false_var->GetMutable(); - } -}; - -class MergeLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "The input phi::DenseTensor, contains complete lod information to " - "construct the output"); - AddInput("Mask", "A bool column vector which mask the input"); - AddInput("InTrue", "The True branch to be merged"); - AddInput("InFalse", "The False branch to be merged"); - AddOutput("Out", "The merged output phi::DenseTensor"); - AddAttr("level", "(int) the 
specific lod level to rank.") - .SetDefault(0) - .EqualGreaterThan(0); - AddComment( - R"DOC( - Merge True and False branches of phi::DenseTensor into a single Output, - with a mask at certain lod level. X is used to obtain complete - lod information. Please refer to SplitLoDTensorOp.)DOC"); - } -}; - -class MergeLoDTensorInferShape : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *context) const override { - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "merge_lod_tensor"); - OP_INOUT_CHECK( - context->HasInput("Mask"), "Input", "Mask", "merge_lod_tensor"); - OP_INOUT_CHECK( - context->HasInput("InTrue"), "Input", "InTrue", "merge_lod_tensor"); - OP_INOUT_CHECK( - context->HasInput("InFalse"), "Input", "InFalse", "merge_lod_tensor"); - OP_INOUT_CHECK( - context->HasOutput("Out"), "Output", "Out", "merge_lod_tensor"); - auto mask_dim = context->GetInputDim("Mask"); - PADDLE_ENFORCE_EQ(mask_dim.size(), - 2, - platform::errors::InvalidArgument( - "If you are using IfElse OP:" - "\n\nie = fluid.layers.IfElse(cond=cond)\nwith " - "ie.true_block():\n out_1 = ie.input(x)\n\n" - "Please ensure that the cond is a 2-D tensor and " - "the second dim size of cond is 1. " - "But now the cond's shape is [%s].\n", - mask_dim)); - if (context->IsRuntime() || mask_dim[1] > 0) { - PADDLE_ENFORCE_EQ(mask_dim[1], - 1, - platform::errors::InvalidArgument( - "If you are using IfElse OP:" - "\n\nie = fluid.layers.IfElse(cond=cond)\nwith " - "ie.true_block():\n out_1 = ie.input(x)\n\n" - "Please ensure that the cond is a 2-D tensor " - "and the second dim size of cond is 1. " - "But now the cond's shape is [%s].\n", - mask_dim)); - } - - context->SetOutputDim("Out", context->GetInputDim("InTrue")); - } -}; - -template -class MergeLoDTensorGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("split_lod_tensor"); - grad_op->SetInput("X", this->OutputGrad("Out")); - grad_op->SetInput("Mask", this->Input("Mask")); - grad_op->SetOutput("OutTrue", this->InputGrad("InTrue")); - grad_op->SetOutput("OutFalse", this->InputGrad("InFalse")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(merge_lod_tensor, - ops::MergeLoDTensorOp, - ops::MergeLoDTensorOpProtoMaker, - ops::MergeLoDTensorInferShape, - ops::MergeLoDTensorGradMaker, - ops::MergeLoDTensorGradMaker); -REGISTER_OPERATOR( - merge_lod_tensor_infer, - ops::MergeLoDTensorInferOp, - ops::MergeLoDTensorOpProtoMaker, - ops::MergeLoDTensorInferShape, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/metrics/CMakeLists.txt b/paddle/fluid/operators/metrics/CMakeLists.txt deleted file mode 100644 index b968dbf288ee2..0000000000000 --- a/paddle/fluid/operators/metrics/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -include(operators) -if(WITH_UNITY_BUILD) - # Load Unity Build rules for operators in paddle/fluid/operators/metrics. - include(unity_build_rule.cmake) -endif() -register_operators() diff --git a/paddle/fluid/operators/metrics/precision_recall_op.cc b/paddle/fluid/operators/metrics/precision_recall_op.cc deleted file mode 100644 index 63385cb59171f..0000000000000 --- a/paddle/fluid/operators/metrics/precision_recall_op.cc +++ /dev/null @@ -1,250 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/metrics/precision_recall_op.h" - -namespace paddle { -namespace operators { - -class PrecisionRecallOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("MaxProbs"), - true, - platform::errors::NotFound( - "PrecisionRecallOp Input(MaxProbs) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Indices"), - true, - platform::errors::NotFound( - "PrecisionRecallOp Input(Indices) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Labels"), - true, - platform::errors::NotFound( - "PrecisionRecallOp Input(Labels) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("BatchMetrics"), - true, - platform::errors::NotFound( - "PrecisionRecallOp Output(BatchMetrics) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("AccumMetrics"), - true, - platform::errors::NotFound( - "PrecisionRecallOp Output(AccumMetrics) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("AccumStatesInfo"), - true, - platform::errors::NotFound( - "PrecisionRecallOp Output(AccumStatesInfo) should not be null.")); - - int64_t cls_num = - static_cast(ctx->Attrs().Get("class_number")); - auto max_probs_dims = ctx->GetInputDim("MaxProbs"); - auto labels_dims = ctx->GetInputDim("Labels"); - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(max_probs_dims[1], - 1, - platform::errors::InvalidArgument( - "Each instance of PrecisionRecallOp " - "Input(MaxProbs) contains one max probability, " - "the shape of Input(MaxProbs) should be " - "[batch_size, 1], the 2nd dimension of " - "Input(MaxProbs) should be 1. But the 2nd " - "dimension we received is %d", - max_probs_dims[1])); - PADDLE_ENFORCE_EQ( - ctx->GetInputDim("Indices"), - max_probs_dims, - platform::errors::InvalidArgument( - "The shape of PrecisionRecallOp Input(Indices) should be same " - "with " - "max_probs_dims. But received the shape of Input(Indices) is " - "[%d, %d], max_probs_dims is [%d, %d]", - ctx->GetInputDim("Indices")[0], - ctx->GetInputDim("Indices")[1], - max_probs_dims[0], - max_probs_dims[1])); - PADDLE_ENFORCE_EQ( - max_probs_dims[0], - labels_dims[0], - platform::errors::InvalidArgument( - "The 1st dimension of PrecisionRecallOp Input(MaxProbs) and " - "Input(Labels) both should be batch_size" - "But the 1st dimension we received max_probs_dims[0] = %d, " - "labels_dims[0] = %d", - max_probs_dims[0], - labels_dims[0])); - PADDLE_ENFORCE_EQ(labels_dims[1], - 1, - platform::errors::InvalidArgument( - "The 2nd dimension of PrecisionRecallOp " - "Input(Labels) contains instance label and " - "the shape should be equal to 1. 
But the 2nd " - "dimension we received is %d", - labels_dims[1])); - } - if (ctx->HasInput("Weights")) { - auto weights_dims = ctx->GetInputDim("Weights"); - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - weights_dims, - common::make_ddim({max_probs_dims[0], 1}), - platform::errors::InvalidArgument( - "The shape of PrecisionRecallOp Input(Weights) should be " - "[batch_size, 1]. But the shape we received is [%d, %d]", - weights_dims[0], - weights_dims[1])); - } - } - if (ctx->HasInput("StatesInfo")) { - auto states_dims = ctx->GetInputDim("StatesInfo"); - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - states_dims, - common::make_ddim({cls_num, 4}), - platform::errors::InvalidArgument( - "The shape of PrecisionRecallOp Input(StatesInfo) should be " - "[class_number, 4]. But the shape we received is [%d, %d]", - states_dims[0], - states_dims[1])); - } - } - - // Layouts of BatchMetrics and AccumMetrics both are: - // [ - // macro average precision, macro average recall, macro average F1 score, - // micro average precision, micro average recall, micro average F1 score - // ] - ctx->SetOutputDim("BatchMetrics", {6}); - ctx->SetOutputDim("AccumMetrics", {6}); - // Shape of AccumStatesInfo is [class_number, 4] - // The layout of each row is: - // [ TP, FP, TN, FN ] - ctx->SetOutputDim("AccumStatesInfo", {cls_num, 4}); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey( - OperatorWithKernel::IndicateVarDataType(ctx, "MaxProbs"), - ctx.GetPlace()); - } -}; - -class PrecisionRecallOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("MaxProbs", - "(Tensor, default Tensor) A 2-D tensor with shape N x 1, " - "where N is the batch size. Each row contains the max probability " - "of an instance which computed by the previous top_k (k=1) " - "operator."); - AddInput("Indices", - "(Tensor, default Tensor) A 2-D tensor with shape N x 1, " - "where N is the batch size. Each row contains the corresponding " - "index which computed by the previous top_k (k=1) operator."); - AddInput("Labels", - "(Tensor, default Tensor) A 2-D tensor with shape N x 1, " - "where N is the batch size. Each element is a label and the " - "value should be in [0, class_number - 1]."); - AddInput("Weights", - "(Tensor, default Tensor) A 2-D tensor with shape N x 1, " - "where N is the batch size. This input is optional. If provided, " - "weight of instance would be considered when computing metrics.") - .AsDispensable(); - AddInput("StatesInfo", - "(Tensor, default Tensor) A 2-D tensor with shape D x 4, " - "where D is the number of classes. This input is optional. If " - "provided, current state will be accumulated to this state and " - "the accumulation state will be the output state.") - .AsDispensable(); - AddOutput("BatchMetrics", - "(Tensor, default Tensor) A 1-D tensor with shape {6}. " - "This output tensor contains metrics for current batch data. " - "The layout is [macro average precision, macro average recall, " - "macro f1 score, micro average precision, micro average recall, " - "micro f1 score]."); - AddOutput("AccumMetrics", - "(Tensor, default Tensor) A 1-D tensor with shape {6}. " - "This output tensor contains metrics for accumulated data. 
" - "The layout is [macro average precision, macro average recall, " - "macro f1 score, micro average precision, micro average recall, " - "micro f1 score]."); - AddOutput("AccumStatesInfo", - "(Tensor, default Tensor) A 2-D tensor with shape D x 4, " - "where D is equal to class number. This output tensor contains " - "accumulated state variables used to compute metrics. The layout " - "for each class is [true positives, false positives, " - "true negatives, false negatives]."); - AddAttr("class_number", "(int) Number of classes to be evaluated."); - AddComment(R"DOC( -Precision Recall Operator. - -When given Input(Indices) and Input(Labels), this operator can be used -to compute various metrics including: -1. macro average precision -2. macro average recall -3. macro f1 score -4. micro average precision -5. micro average recall -6. micro f1 score - -To compute the above metrics, we need to do statistics for true positives, -false positives and false negatives. Here the count of true negatives is not -necessary, but counting it may provide potential usage and the cost is -trivial, so the operator also provides the count of true negatives. - -We define state as a 2-D tensor with shape [class_number, 4]. Each row of a -state contains statistic variables for corresponding class. Layout of each row -is: TP(true positives), FP(false positives), TN(true negatives), -FN(false negatives). If Input(Weights) is provided, TP, FP, TN, FN will be -calculated by given weight instead of the instance count. - -This operator also supports metrics computing for cross-batch situation. To -achieve this, Input(StatesInfo) should be provided. State of current batch -data will be accumulated to Input(StatesInfo) and Output(AccumStatesInfo) -is the accumulation state. - -Output(BatchMetrics) is metrics of current batch data while -Output(AccumStatesInfo) is metrics of accumulation data. - -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - precision_recall, - ops::PrecisionRecallOp, - ops::PrecisionRecallOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -PD_REGISTER_STRUCT_KERNEL(precision_recall, - CPU, - ALL_LAYOUT, - ops::PrecisionRecallKernel, - float, - double) {} diff --git a/paddle/fluid/operators/metrics/precision_recall_op.h b/paddle/fluid/operators/metrics/precision_recall_op.h deleted file mode 100644 index 6eef5658c5c00..0000000000000 --- a/paddle/fluid/operators/metrics/precision_recall_op.h +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -using EigenMatrix = framework::EigenMatrix; - -enum StateVariable { TP = 0, FP, TN, FN }; - -template -class PrecisionRecallKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in0 = ctx.Input("Indices"); - auto* in1 = ctx.Input("Labels"); - auto* in2 = ctx.Input("Weights"); - auto* in3 = ctx.Input("StatesInfo"); - auto* out0 = ctx.Output("BatchMetrics"); - auto* out1 = ctx.Output("AccumMetrics"); - auto* out2 = ctx.Output("AccumStatesInfo"); - - const int* ids_data = in0->data(); - const int* labels_data = in1->data(); - size_t cls_num = static_cast(ctx.Attr("class_number")); - const T* weights_data = in2 ? in2->data() : nullptr; - const T* states_data = in3 ? in3->data() : nullptr; - double* batch_metrics_data = out0->mutable_data(ctx.GetPlace()); - double* accum_metrics_data = out1->mutable_data(ctx.GetPlace()); - out2->mutable_data(ctx.GetPlace()); - auto accum_states = EigenMatrix::From(*out2); - accum_states.setZero(); - T* accum_states_data = out2->data(); - - size_t sample_num = in0->dims()[0]; - size_t state_var_num = 4; // TP FP TN FN - - // get states info for current batch - for (size_t i = 0; i < sample_num; ++i) { - size_t idx = ids_data[i]; - size_t label = labels_data[i]; - - PADDLE_ENFORCE_GE( - idx, - 0, - platform::errors::InvalidArgument( - "Class index of each instance should be " - "greater than or equal to 0, But the index we received is %d", - idx)); - PADDLE_ENFORCE_LT(idx, - cls_num, - platform::errors::InvalidArgument( - "Class index of each instance should be less than " - "cls_num = %d, But the index we received is %d", - cls_num, - idx)); - - PADDLE_ENFORCE_GE(label, - 0, - platform::errors::InvalidArgument( - "Label of each instance should be greater than or " - "equal to 0, But the label we received is %d", - label)); - PADDLE_ENFORCE_LT(label, - cls_num, - platform::errors::InvalidArgument( - "Label of each instance should be less than " - "cls_num = %d, But the label we received is %d", - cls_num, - label)); - - T w = weights_data ? 
weights_data[i] : 1.0; - if (idx == label) { - accum_states_data[idx * state_var_num + TP] += w; - for (size_t j = 0; j < cls_num; ++j) { - accum_states_data[j * state_var_num + TN] += w; - } - accum_states_data[idx * state_var_num + TN] -= w; - } else { - accum_states_data[label * state_var_num + FN] += w; - accum_states_data[idx * state_var_num + FP] += w; - for (size_t j = 0; j < cls_num; ++j) { - accum_states_data[j * state_var_num + TN] += w; - } - accum_states_data[idx * state_var_num + TN] -= w; - accum_states_data[label * state_var_num + TN] -= w; - } - } - - ComputeMetrics( - accum_states_data, batch_metrics_data, state_var_num, cls_num); - - if (states_data) { - for (size_t i = 0; i < cls_num; ++i) { - for (size_t j = 0; j < state_var_num; ++j) { - size_t idx = i * state_var_num + j; - accum_states_data[idx] += states_data[idx]; - } - } - } - - ComputeMetrics( - accum_states_data, accum_metrics_data, state_var_num, cls_num); - } - - // expose to be reused - static inline T CalcPrecision(T tp_count, T fp_count) { - if (tp_count > 0.0 || fp_count > 0.0) { - return tp_count / (tp_count + fp_count); - } - return 1.0; - } - - static inline T CalcRecall(T tp_count, T fn_count) { - if (tp_count > 0.0 || fn_count > 0.0) { - return tp_count / (tp_count + fn_count); - } - return 1.0; - } - - static inline T CalcF1Score(T precision, T recall) { - if (precision > 0.0 || recall > 0.0) { - return 2 * precision * recall / (precision + recall); - } - return 0.0; - } - - protected: - void ComputeMetrics(const T* states_data, - double* metrics_data, - size_t state_var_num, - size_t cls_num) const { - T total_tp_count = 0; - T total_fp_count = 0; - T total_fn_count = 0; - T macro_avg_precision = 0.0; - T macro_avg_recall = 0.0; - - for (size_t i = 0; i < cls_num; ++i) { - T tp_count = states_data[i * state_var_num + TP]; - T fp_count = states_data[i * state_var_num + FP]; - T fn_count = states_data[i * state_var_num + FN]; - total_tp_count += tp_count; - total_fp_count += fp_count; - total_fn_count += fn_count; - macro_avg_precision += CalcPrecision(tp_count, fp_count); - macro_avg_recall += CalcRecall(tp_count, fn_count); - } - macro_avg_precision /= cls_num; - macro_avg_recall /= cls_num; - T macro_f1_score = CalcF1Score(macro_avg_precision, macro_avg_recall); - - T micro_avg_precision = CalcPrecision(total_tp_count, total_fp_count); - T micro_avg_recall = CalcRecall(total_tp_count, total_fn_count); - T micro_f1_score = CalcF1Score(micro_avg_precision, micro_avg_recall); - - // fill metrics data - metrics_data[0] = macro_avg_precision; - metrics_data[1] = macro_avg_recall; - metrics_data[2] = macro_f1_score; - metrics_data[3] = micro_avg_precision; - metrics_data[4] = micro_avg_recall; - metrics_data[5] = micro_f1_score; - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/metrics/unity_build_rule.cmake b/paddle/fluid/operators/metrics/unity_build_rule.cmake deleted file mode 100644 index 58acbc3b1e62f..0000000000000 --- a/paddle/fluid/operators/metrics/unity_build_rule.cmake +++ /dev/null @@ -1,8 +0,0 @@ -# This file records the Unity Build compilation rules. -# The source files in a `register_unity_group` called are compiled in a unity -# file. -# Generally, the combination rules in this file do not need to be modified. -# If there are some redefined error in compiling with the source file which -# in combination rule, you can remove the source file from the following rules. 
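For reference, the macro/micro averaging performed by ComputeMetrics above can be checked with a small standalone sketch. This is a minimal illustration with hypothetical names and made-up counts; it only mirrors the CalcPrecision/CalcRecall/CalcF1Score conventions of the deleted kernel, including their degenerate-case defaults.

#include <cstdio>

// Per-class counts, laid out like one AccumStatesInfo row: [TP, FP, TN, FN].
struct ClassState { double tp, fp, tn, fn; };

double Precision(double tp, double fp) { return (tp > 0 || fp > 0) ? tp / (tp + fp) : 1.0; }
double Recall(double tp, double fn) { return (tp > 0 || fn > 0) ? tp / (tp + fn) : 1.0; }
double F1(double p, double r) { return (p > 0 || r > 0) ? 2 * p * r / (p + r) : 0.0; }

int main() {
  const ClassState s[2] = {{3, 1, 5, 1}, {4, 2, 3, 1}};  // two classes, made-up counts
  double macro_p = 0, macro_r = 0, tp = 0, fp = 0, fn = 0;
  for (const auto& c : s) {
    macro_p += Precision(c.tp, c.fp);    // macro: average the per-class metrics
    macro_r += Recall(c.tp, c.fn);
    tp += c.tp; fp += c.fp; fn += c.fn;  // micro: pool the raw counts first
  }
  macro_p /= 2; macro_r /= 2;
  std::printf("macro F1 = %.3f, micro F1 = %.3f\n",
              F1(macro_p, macro_r), F1(Precision(tp, fp), Recall(tp, fn)));
  return 0;
}

The deleted unity-build grouping for these metric ops continues below.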
-register_unity_group(cc accuracy_op.cc auc_op.cc precision_recall_op.cc) -register_unity_group(cu accuracy_op.cu auc_op.cu) diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc deleted file mode 100644 index 64bc176d97149..0000000000000 --- a/paddle/fluid/operators/minus_op.cc +++ /dev/null @@ -1,162 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/minus_op.h" - -#include -#include -#include -#include - -namespace paddle { -namespace operators { - -class MinusOp : public framework::OperatorWithKernel { - public: - MinusOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorWithKernel(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), - true, - platform::errors::NotFound("Input(X) of MinusOp is not found.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Y"), - true, - platform::errors::NotFound("Input(Y) of MinusOp is not found.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), - true, - platform::errors::NotFound("Output(Out) of MinusOp is not found.")); - - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - - if (ctx->IsRuntime() || - (common::product(x_dims) > 0 && common::product(y_dims) > 0)) { - PADDLE_ENFORCE_EQ( - x_dims, - y_dims, - platform::errors::InvalidArgument( - "Minus operator must take two tensor with same dim, but received " - "input X dim is:[%s], Y dim is:[%s]", - x_dims, - y_dims)); - } - ctx->SetOutputDim("Out", x_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } -}; - -class MinusOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "The left tensor of minus operator."); - AddInput("Y", "The right tensor of minus operator."); - AddOutput("Out", "The output tensor of minus operator."); - - AddComment(R"DOC( -Minus Operator. - -Equation: - - $Out = X - Y$ - -Both the input `X` and `Y` can carry the LoD (Level of Details) information, -or not. But the output only shares the LoD information with input `X`. 
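-A short worked example (illustrative values only): for X = [3, 1] and
-Y = [1, 4], Out = X - Y = [2, -3]. In the backward pass, assembled by the
-gradient makers below, both gradients reuse the existing `scale` op:
-dX = 1.0 * dOut and dY = -1.0 * dOut.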
- -)DOC"); - } -}; - -class MinusGradDescMaker : public framework::GradOpDescMakerBase { - public: - using framework::GradOpDescMakerBase::GradOpDescMakerBase; - - std::vector> operator()() const override { - std::vector> ops; - auto x_g = this->InputGrad("X"); - if (!x_g.empty()) { - auto *x_g_op = new framework::OpDesc(); - x_g_op->SetType("scale"); - x_g_op->SetInput("X", this->OutputGrad("Out")); - x_g_op->SetOutput("Out", x_g); - x_g_op->SetAttr("scale", 1.0f); - ops.emplace_back(x_g_op); - } - - auto y_g = this->InputGrad("Y"); - if (!y_g.empty()) { - auto *y_g_op = new framework::OpDesc(); - y_g_op->SetType("scale"); - y_g_op->SetInput("X", this->OutputGrad("Out")); - y_g_op->SetOutput("Out", y_g); - y_g_op->SetAttr("scale", -1.0f); - ops.emplace_back(y_g_op); - } - - return ops; - } -}; - -class MinusGradMaker : public imperative::GradOpBaseMakerBase { - public: - using imperative::GradOpBaseMakerBase::GradOpBaseMakerBase; - - std::shared_ptr operator()() const override { - auto x_g = this->InputGrad("X"); - auto y_g = this->InputGrad("Y"); - - auto node = this->NewGradNode(); - - if (!x_g.empty()) { - imperative::TracedGradOp op(node); - op.SetType("scale"); - op.SetInput("X", this->OutputGrad("Out")); - op.SetOutput("Out", x_g); - op.SetAttr("scale", 1.0f); - op.SetDefaultAttrsMap(DefaultAttrsMap()); - } - - if (!y_g.empty()) { - imperative::TracedGradOp op(node); - op.SetType("scale"); - op.SetInput("X", this->OutputGrad("Out")); - op.SetOutput("Out", y_g); - op.SetAttr("scale", -1.0f); - op.SetDefaultAttrsMap(DefaultAttrsMap()); - } - - return node; - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(minus, - ops::MinusOp, - ops::MinusOpMaker, - ops::MinusGradDescMaker, - ops::MinusGradMaker); -PD_REGISTER_STRUCT_KERNEL(minus, CPU, ALL_LAYOUT, ops::MinusKernel, float) {} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_STRUCT_KERNEL(minus, GPU, ALL_LAYOUT, ops::MinusKernel, float) {} -#endif diff --git a/paddle/fluid/operators/minus_op.h b/paddle/fluid/operators/minus_op.h deleted file mode 100644 index 8cc18fe0c97ec..0000000000000 --- a/paddle/fluid/operators/minus_op.h +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" - -namespace paddle { -namespace operators { - -template <typename T, typename DeviceContext> -class MinusKernel : public framework::OpKernel<T> { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* left_tensor = context.Input<phi::DenseTensor>("X"); - auto* right_tensor = context.Input<phi::DenseTensor>("Y"); - auto* out_tensor = context.Output<phi::DenseTensor>("Out"); - - out_tensor->mutable_data<T>(context.GetPlace()); - auto& dev = - *context.template device_context<DeviceContext>().eigen_device(); - EigenSub<std::decay_t<decltype(dev)>, T>::Eval( - dev, - framework::EigenVector<T>::Flatten(*out_tensor), - framework::EigenVector<T>::Flatten(*left_tensor), - framework::EigenVector<T>::Flatten(*right_tensor)); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/miopen_rnn_cache.h b/paddle/fluid/operators/miopen_rnn_cache.h index 19255363259b5..2a8b38d38d577 100644 --- a/paddle/fluid/operators/miopen_rnn_cache.h +++ b/paddle/fluid/operators/miopen_rnn_cache.h @@ -92,10 +92,10 @@ struct CudnnRNNCache { const auto numDirections = is_bidirec_ ? 2 : 1; - PADDLE_ENFORCE_EQ(miopen_type, - miopenFloat, - platform::errors::InvalidArgument( - "MIOPEN do not support double datatype.")); + PADDLE_ENFORCE_EQ( + miopen_type, + miopenFloat, + phi::errors::InvalidArgument("MIOPEN does not support double datatype.")); auto miopen_size = sizeof(float); x_desc_ = new miopenTensorDescriptor_t[seq_length_]; @@ -259,7 +259,7 @@ struct CudnnRNNCache { PADDLE_ENFORCE_EQ( weights_size_, miopen_size * weight_numel, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The miopen lstm and setting weight size should be the same.")); int dim_w[3]; diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc index c6d553865277e..4da376ce97487 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.cc +++ b/paddle/fluid/operators/modified_huber_loss_op.cc @@ -33,16 +33,16 @@ class ModifiedHuberLossOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( x_dims.size(), 2, - platform::errors::InvalidArgument("Input(input) rank should be 2, " - "but received input rank(%d) != 2", - x_dims.size())); + phi::errors::InvalidArgument("Input(input) rank should be 2, " + "but received input rank(%d) != 2", + x_dims.size())); if (ctx->IsRuntime() || (common::product(x_dims) > 0 && common::product(y_dims) > 0)) { PADDLE_ENFORCE_EQ( x_dims, y_dims, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(input) and Input(label) should have the same " "shape, but received input shape [%s] != label shape [%s]", x_dims, @@ -52,7 +52,7 @@ class ModifiedHuberLossOp : public framework::OperatorWithKernel { if (ctx->IsRuntime()) { PADDLE_ENFORCE_EQ(x_dims[1], 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dimension of Input(input) should be 1, " "but received second dimension of input (%d) != 1", x_dims[1])); @@ -123,7 +123,7 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( intermediate_dims, y_dims, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of Intermediate variable which will be reused in " "backward processing should be the same as " "the shape of Input(label), but received Intermediate variable " @@ -134,7 +134,7 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( out_grad_dims,
y_dims, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of output gradient should be the same as " "the shape of Input(label), but received the output gradient " "shape [%s] != label shape [%s]", diff --git a/paddle/fluid/operators/modified_huber_loss_op.h b/paddle/fluid/operators/modified_huber_loss_op.h index 88cb91d454e72..d0fb4dd40a667 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.h +++ b/paddle/fluid/operators/modified_huber_loss_op.h @@ -32,7 +32,7 @@ struct CheckLabelValue { PADDLE_ENFORCE_EQ( val == static_cast(0) || val == static_cast(1), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(label) value of modified_huber_loss_op expected to be 0 " "or 1, but got %ld. Please check label value.", val)); diff --git a/paddle/fluid/operators/nccl/nccl_op.cc b/paddle/fluid/operators/nccl/nccl_op.cc index c5a1097e2f157..dd3fd52d3b24d 100644 --- a/paddle/fluid/operators/nccl/nccl_op.cc +++ b/paddle/fluid/operators/nccl/nccl_op.cc @@ -34,13 +34,13 @@ class NCCLInitOp : public framework::OperatorBase { const platform::Place &place) const override { PADDLE_ENFORCE_NOT_NULL( scope.FindVar(Input(kParallelScopes)), - platform::errors::NotFound("Can not find variable '%s' in the scope.", - kParallelScopes)); + phi::errors::NotFound("Can not find variable '%s' in the scope.", + kParallelScopes)); const auto &name = Output("Communicator"); PADDLE_ENFORCE_NOT_NULL( scope.FindVar(name), - platform::errors::NotFound( - "Output(%s) is needed for ncclInit operator.", name)); + phi::errors::NotFound("Output(%s) is needed for ncclInit operator.", + name)); // A parallel do may not use all the gpus. For example, the batch size is 7 // in the last batch while we have 8 gpu. In this case, parallel_do will // create 7 parallel scopes, so should ncclInitOp create 7 gpu peers @@ -52,7 +52,7 @@ class NCCLInitOp : public framework::OperatorBase { } PADDLE_ENFORCE_EQ(!gpus.empty(), true, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "gpus is empty, NCCL must init with gpus")); platform::Communicator *comm = @@ -104,11 +104,10 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NCCLAllReduce"); std::string reduction = ctx->Attrs().Get("reduction"); - PADDLE_ENFORCE_EQ( - (reduction == "ncclSum" || reduction == "ncclProd" || - reduction == "ncclMin" || reduction == "ncclMax"), - true, - platform::errors::InvalidArgument("invalid nccl reduction.")); + PADDLE_ENFORCE_EQ((reduction == "ncclSum" || reduction == "ncclProd" || + reduction == "ncclMin" || reduction == "ncclMax"), + true, + phi::errors::InvalidArgument("invalid nccl reduction.")); auto x_dims = ctx->GetInputsDim("X"); ctx->SetOutputsDim("Out", x_dims); @@ -150,11 +149,10 @@ class NCCLReduceOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NCCLReduce"); std::string reduction = ctx->Attrs().Get("reduction"); - PADDLE_ENFORCE_EQ( - (reduction == "ncclSum" || reduction == "ncclProd" || - reduction == "ncclMin" || reduction == "ncclMax"), - true, - platform::errors::InvalidArgument("invalid nccl reduction.")); + PADDLE_ENFORCE_EQ((reduction == "ncclSum" || reduction == "ncclProd" || + reduction == "ncclMin" || reduction == "ncclMax"), + true, + phi::errors::InvalidArgument("invalid nccl reduction.")); auto x_dims = ctx->GetInputsDim("X"); ctx->SetOutputsDim("Out", x_dims); @@ -201,10 +199,9 @@ class NCCLBcastOp : public 
framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NCCLBcast"); int root = ctx->Attrs().Get("root"); - PADDLE_ENFORCE_EQ( - root != platform::kInvalidGPUId, - true, - platform::errors::InvalidArgument("Bcast root must be set.")); + PADDLE_ENFORCE_EQ(root != platform::kInvalidGPUId, + true, + phi::errors::InvalidArgument("Bcast root must be set.")); auto x_dims = ctx->GetInputsDim("X"); ctx->SetOutputsDim("Out", x_dims); diff --git a/paddle/fluid/operators/nccl/nccl_op.cu.cc b/paddle/fluid/operators/nccl/nccl_op.cu.cc index abb24cc8cae10..f1d6073a37231 100644 --- a/paddle/fluid/operators/nccl/nccl_op.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op.cu.cc @@ -46,7 +46,7 @@ static ncclRedOp_t str_to_nccl_red_type(std::string reduction) { auto it = str_to_type.find(reduction); PADDLE_ENFORCE_EQ(it != str_to_type.end(), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Invalid nccl reduction. Must be ncclMin | ncclMax | " "ncclProd | ncclSum")); return it->second; @@ -58,7 +58,7 @@ class NCCLAllReduceKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "This kernel only runs on GPU device.")); auto* x = ctx.Input("X"); auto* out = ctx.Output("Out"); @@ -91,10 +91,10 @@ template class NCCLReduceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), - true, - platform::errors::InvalidArgument( - "This kernel only runs on GPU device.")); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), + true, + phi::errors::InvalidArgument("This kernel only runs on GPU device.")); auto x = ctx.Input("X"); // x0, x1, x2 auto out = ctx.Output("Out"); auto* comm = ctx.Input("Communicator"); @@ -132,10 +132,10 @@ template class NCCLBcastKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), - true, - platform::errors::InvalidArgument( - "This kernel only runs on GPU device.")); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), + true, + phi::errors::InvalidArgument("This kernel only runs on GPU device.")); int root = ctx.Attr("root"); auto* comm = ctx.Input("Communicator"); // device id diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 1b622b7571667..ac260615969b4 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -45,7 +45,7 @@ class NCEOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( x_dims[0], label_dims[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dimension of Input(Input) and Input(Label) should be " "equal in runtime. But received: Input(Input)'s shape = [%s] " "with 1st dim = %d, Input(Label)'s shape = [%s] with 1st dim = " @@ -61,7 +61,7 @@ class NCEOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->GetInputDim("Weight")[0], ctx->GetInputDim("Bias")[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dimension of Input(Weight) and Input(Bias) " "should be equal. 
But received: Input(Weight)'s shape = [%s] " "with 1st dim = %d, and Input(Bias)'s shape = [%s] with 1st dim " @@ -78,7 +78,7 @@ class NCEOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( num_total_classes, ctx->GetInputDim("Weight")[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of total classes should be equal to the first " "dimension of Input(Weight). But received: Attr(num_total_classes) " "= %d, Input(Weight)'s shape = [%s] with 1st dim = %d.", @@ -89,7 +89,7 @@ class NCEOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( custom_neg_classes.size(), static_cast(num_neg_samples), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of Attr(custom_neg_classes) should be equal " "to the number of negative samples. But received: " "custom_neg_classes.size() = %d, num_neg_samples = %d.", diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 41262dca6e53c..25a970a5fa6da 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -104,7 +104,7 @@ class NCEKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( dist_probs->numel(), num_total_classes, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: The number of elements in Input(CustomDistProbs) " "should be equal to the number of total classes. But Received: " "Input(CustomDistProbs).numel() = %d, Attr(num_total_classes) " @@ -114,7 +114,7 @@ class NCEKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( dist_alias->numel(), num_total_classes, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: The number of elements in Input(CustomDistAlias) " "should be equal to the number of total classes. But Received: " "Input(CustomDistAlias).numel() = %d, Attr(num_total_classes) " @@ -124,7 +124,7 @@ class NCEKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( dist_alias_probs->numel(), num_total_classes, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: The number of elements in " "Input(CustomDistAliasProbs) " "should be equal to the number of total classes. But Received: " @@ -144,7 +144,7 @@ class NCEKernel : public framework::OpKernel { break; } default: { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Unsupported SamplerType. SamplerType should be 0: Uniform, " "1: LogUniform or 2: CustomDist. Received SamplerType: %d", sampler_type)); @@ -180,7 +180,7 @@ class NCEKernel : public framework::OpKernel { for (int x = 0; x < sample_labels->numel(); x++) { PADDLE_ENFORCE_GE(sample_labels_data[x], 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ValueError: Every sample label should be " "non-negative. But received: " "Input(SampleLabels)[%d] = %d", @@ -290,7 +290,7 @@ class NCEGradKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( dist_probs->numel(), num_total_classes, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: The number of elements in Input(CustomDistProbs) " "should be equal to the number of total classes. 
But Received: " "Input(CustomDistProbs).numel() = %d, Attr(num_total_classes) " @@ -300,7 +300,7 @@ class NCEGradKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( dist_alias->numel(), num_total_classes, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: The number of elements in Input(CustomDistAlias) " "should be equal to the number of total classes. But Received: " "Input(CustomDistAlias).numel() = %d, Attr(num_total_classes) " @@ -310,7 +310,7 @@ class NCEGradKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( dist_alias_probs->numel(), num_total_classes, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: The number of elements in " "Input(CustomDistAliasProbs) " "should be equal to the number of total classes. But Received: " @@ -330,7 +330,7 @@ class NCEGradKernel : public framework::OpKernel { break; } default: { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Unsupported SamplerType. SamplerType should be 0: Uniform, " "1: LogUniform or 2: CustomDist. Received SamplerType: %d", sampler_type)); @@ -399,7 +399,7 @@ class NCEGradKernel : public framework::OpKernel { auto *table_t = context.Input("Weight"); table_dim = table_t->value().dims(); } else { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "The parameter Weight of a NCE_OP " "must be either phi::DenseTensor or SelectedRows")); } diff --git a/paddle/fluid/operators/number_count_op.cc b/paddle/fluid/operators/number_count_op.cc index a67d6455bcf5f..7fb293891d3a5 100644 --- a/paddle/fluid/operators/number_count_op.cc +++ b/paddle/fluid/operators/number_count_op.cc @@ -35,7 +35,7 @@ class NumberCountOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(number_dtype, framework::proto::VarType::INT64, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dtype of the number_dtype should be int64")); return phi::KernelKey(number_dtype, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc b/paddle/fluid/operators/onednn/interpolate_onednn_op.cc similarity index 97% rename from paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc rename to paddle/fluid/operators/onednn/interpolate_onednn_op.cc index 34e9679b29bb6..eff574b5a577b 100644 --- a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc +++ b/paddle/fluid/operators/onednn/interpolate_onednn_op.cc @@ -115,9 +115,8 @@ class InterpolateOneDNNKernel : public framework::OpKernel { std::all_of( out_dims.begin(), out_dims.end(), [](int i) { return i > 0; }), 0, - platform::errors::InvalidArgument( - "out_d, out_h, out_w of Op(interpolate) " - "should be greater than 0.")); + phi::errors::InvalidArgument("out_d, out_h, out_w of Op(interpolate) " + "should be greater than 0.")); const std::vector nc_dims = {in_dims[0], in_dims[1]}; out_dims.insert(out_dims.begin(), nc_dims.begin(), nc_dims.end()); diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/onednn/lrn_onednn_op.cc similarity index 94% rename from paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc rename to paddle/fluid/operators/onednn/lrn_onednn_op.cc index 7b22d5d3c6ff0..77d76add0174e 100644 --- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/onednn/lrn_onednn_op.cc @@ -69,7 +69,7 @@ class LRNOneDNNHandler PADDLE_ENFORCE_EQ( ctx.Attr("is_test"), false, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "is_test 
attribute should be set to False in training phase.")); const int n = ctx.Attr("n"); @@ -123,11 +123,11 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE_EQ( is_float_type, true, - platform::errors::PreconditionNotMet("DNNL LRN must use float data.")); - PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), - true, - paddle::platform::errors::PreconditionNotMet( - "Operator DNNL LRN must use CPUPlace")); + phi::errors::PreconditionNotMet("DNNL LRN must use float data.")); + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(ctx.GetPlace()), + true, + phi::errors::PreconditionNotMet("Operator DNNL LRN must use CPUPlace")); auto& dev_ctx = ctx.template device_context(); const auto& onednn_engine = dev_ctx.GetEngine(); @@ -169,11 +169,11 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { const bool is_float_type = std::is_same::value; PADDLE_ENFORCE_EQ(is_float_type, true, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "DNNL LRN GradOpKernel must use float data.")); PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Operator DNNL LRNGrad must use CPUPlace")); auto in_x = ctx.Input("X"); diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc b/paddle/fluid/operators/onednn/matmul_onednn_op.cc similarity index 97% rename from paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc rename to paddle/fluid/operators/onednn/matmul_onednn_op.cc index 80af1b00b743c..b501cec806069 100644 --- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc +++ b/paddle/fluid/operators/onednn/matmul_onednn_op.cc @@ -400,15 +400,15 @@ class MatMulMKLDNNKernel : public paddle::framework::OpKernel { trans_y, out); } else if (is_bfloat16) { - ExecuteMatMulV1(ctx, - onednn_engine, - x, - x_bd_dims, - trans_x, - y, - y_bd_dims, - trans_y, - out); + ExecuteMatMulV1(ctx, + onednn_engine, + x, + x_bd_dims, + trans_x, + y, + y_bd_dims, + trans_y, + out); } else { ExecuteMatMulV1(ctx, onednn_engine, @@ -661,7 +661,7 @@ REGISTER_OP_KERNEL(matmul, MKLDNN, ::phi::CPUPlace, MatMulMKLDNNKernel, - MatMulMKLDNNKernel, + MatMulMKLDNNKernel, MatMulMKLDNNKernel, MatMulMKLDNNKernel); @@ -669,4 +669,4 @@ REGISTER_OP_KERNEL(matmul_grad, MKLDNN, ::phi::CPUPlace, MatMulGradMKLDNNKernel, - MatMulGradMKLDNNKernel); + MatMulGradMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/onednn/quantize_onednn_op.cc similarity index 96% rename from paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc rename to paddle/fluid/operators/onednn/quantize_onednn_op.cc index 9b1cff1008677..3ad56469c922c 100644 --- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc +++ b/paddle/fluid/operators/onednn/quantize_onednn_op.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/onednn_helper.h" #include "paddle/phi/backends/onednn/onednn_reuse.h" namespace paddle { namespace operators { @@ -39,10 +39,10 @@ class QuantOpKernel : public framework::OpKernel { PADDLE_ENFORCE_NE(quantization_scale, 0.0f, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Quantization scale must be different from 0.0f")); PADDLE_ENFORCE(quantization_shift <= 255 && quantization_shift >= 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Quantization shift must be lower than or equal to " "255 and greater than or equal to 0, but got %f", quantization_shift)); diff --git a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc b/paddle/fluid/operators/onednn/requantize_onednn_op.cc similarity index 92% rename from paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc rename to paddle/fluid/operators/onednn/requantize_onednn_op.cc index f467a9c57a8ca..2d277625dc34d 100644 --- a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc +++ b/paddle/fluid/operators/onednn/requantize_onednn_op.cc @@ -47,17 +47,17 @@ class ReQuantOpKernel : public framework::OpKernel { PADDLE_ENFORCE_NE( scale_in, 0.0f, - platform::errors::InvalidArgument("Scale of input cannot be 0.0")); + phi::errors::InvalidArgument("Scale of input cannot be 0.0")); PADDLE_ENFORCE_NE( scale_out, 0.0f, - platform::errors::InvalidArgument("Scale of output cannot be 0.0")); + phi::errors::InvalidArgument("Scale of output cannot be 0.0")); if (shift_in != 0) { PADDLE_ENFORCE_EQ( input->dtype(), DataType::UINT8, - platform::errors::Unimplemented("Requantize does not support nonzero " - "shift for signed input.")); + phi::errors::Unimplemented("Requantize does not support nonzero " "shift for signed input.")); } auto& dev_ctx = ctx.template device_context<phi::OneDNNContext>(); @@ -140,4 +140,4 @@ PD_REGISTER_STRUCT_KERNEL(requantize, ops::ReQuantOpKernel, int8_t, uint8_t, - paddle::platform::bfloat16) {} + phi::dtype::bfloat16) {} diff --git a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc b/paddle/fluid/operators/onednn/reshape_onednn_op.cc similarity index 94% rename from paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc rename to paddle/fluid/operators/onednn/reshape_onednn_op.cc index 8632160b04ae0..7dba03ca6a799 100644 --- a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc +++ b/paddle/fluid/operators/onednn/reshape_onednn_op.cc @@ -37,7 +37,7 @@ static std::vector extract_shape( PADDLE_ENFORCE_EQ( tensor->dims(), common::make_ddim({1}), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "If the element type of 'shape' in ReshapeOp is phi::DenseTensor, " "the element's shape must be [1]. But received the element's shape " "is [%s]", @@ -104,7 +104,7 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { break; case ReshapeKernelOpName::flatten: default: - PADDLE_THROW(paddle::platform::errors::OutOfRange( + PADDLE_THROW(phi::errors::OutOfRange( "Reshape kernel does not support that operator name")); } } @@ -180,7 +180,7 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( unk_dim_idx, -1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only one dimension value of 'shape' in ReshapeOp can " "be -1.
But received shape = [%s], shape[%d] is also -1.", common::make_ddim(shape), @@ -190,7 +190,7 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { PADDLE_ENFORCE_LT( static_cast(i), in_dims.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The index of 0 in `shape` must be less than " "the input tensor X's dimensions. " "But received shape = [%s], shape[%d] = 0, X's shape = [%s], " @@ -203,7 +203,7 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { PADDLE_ENFORCE_GT( shape[i], 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Each dimension value of 'shape' in ReshapeOp must not " "be negative except one unknown dimension. " "But received shape = [%s], shape[%d] = %d.", @@ -227,7 +227,7 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( output_shape[unk_dim_idx] * capacity, -in_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The 'shape' attribute in ReshapeOp is invalid. " "The input tensor X's size must be divisible by known " "capacity of 'shape'. " @@ -245,7 +245,7 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( capacity, in_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The 'shape' in ReshapeOp is invalid. " "The input tensor X's size must be equal to the capacity of " "'shape'. " @@ -319,7 +319,7 @@ class ReshapeGradMKLDNNKernel : public ReshapeMKLDNNKernel { InferShapeFlattenGradOp(ctx, x_dims); break; default: - PADDLE_THROW(paddle::platform::errors::OutOfRange( + PADDLE_THROW(phi::errors::OutOfRange( "Reshape grad kernel does not support that operator name")); } } @@ -345,7 +345,7 @@ REGISTER_OP_KERNEL( MKLDNN, phi::CPUPlace, ops::ReshapeMKLDNNKernel, - ops::ReshapeMKLDNNKernel); REGISTER_OP_KERNEL( @@ -353,7 +353,7 @@ REGISTER_OP_KERNEL( MKLDNN, phi::CPUPlace, ops::ReshapeGradMKLDNNKernel, - ops::ReshapeGradMKLDNNKernel); REGISTER_OP_KERNEL( @@ -361,7 +361,7 @@ REGISTER_OP_KERNEL( MKLDNN, phi::CPUPlace, ops::ReshapeMKLDNNKernel, - ops::ReshapeMKLDNNKernel); REGISTER_OP_KERNEL( @@ -369,7 +369,7 @@ REGISTER_OP_KERNEL( MKLDNN, phi::CPUPlace, ops::ReshapeGradMKLDNNKernel, - ops::ReshapeGradMKLDNNKernel); REGISTER_OP_KERNEL( @@ -377,7 +377,7 @@ REGISTER_OP_KERNEL( MKLDNN, phi::CPUPlace, ops::ReshapeMKLDNNKernel, - ops::ReshapeMKLDNNKernel); REGISTER_OP_KERNEL( @@ -385,5 +385,5 @@ REGISTER_OP_KERNEL( MKLDNN, phi::CPUPlace, ops::ReshapeGradMKLDNNKernel, - ops::ReshapeGradMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/shuffle_channel_mkldnn_op.cc b/paddle/fluid/operators/onednn/shuffle_channel_onednn_op.cc similarity index 97% rename from paddle/fluid/operators/mkldnn/shuffle_channel_mkldnn_op.cc rename to paddle/fluid/operators/onednn/shuffle_channel_onednn_op.cc index 6f656c5f1a2a1..19396cbe489ce 100644 --- a/paddle/fluid/operators/mkldnn/shuffle_channel_mkldnn_op.cc +++ b/paddle/fluid/operators/onednn/shuffle_channel_onednn_op.cc @@ -75,4 +75,4 @@ REGISTER_OP_KERNEL(shuffle_channel, MKLDNN, phi::CPUPlace, ops::ShuffleChannelMKLDNNKernel, - ops::ShuffleChannelMKLDNNKernel); + ops::ShuffleChannelMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/onednn/transpose_onednn_op.cc similarity index 100% rename from paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc rename to paddle/fluid/operators/onednn/transpose_onednn_op.cc diff --git a/paddle/fluid/operators/ops_signature/elementwise_sig.cc b/paddle/fluid/operators/ops_signature/elementwise_sig.cc
index b1150268fbad1..82f891bb48a00 100644 --- a/paddle/fluid/operators/ops_signature/elementwise_sig.cc +++ b/paddle/fluid/operators/ops_signature/elementwise_sig.cc @@ -168,7 +168,7 @@ KernelSignature ElementwiseDivGradOpArgumentMapping( KernelSignature ElementwiseDivDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx UNUSED) { return KernelSignature("divide_double_grad", - {"Y", "Out", "DX", "DDX", "DDY"}, + {"Y", "Out", "Out@GRAD", "DX", "DDX", "DDY"}, {"axis"}, {"Y@GRAD", "DOut", "DDOut"}); } diff --git a/paddle/fluid/operators/ops_signature/pow2_decay_with_linear_warmup_sig.cc b/paddle/fluid/operators/ops_signature/pow2_decay_with_linear_warmup_sig.cc deleted file mode 100644 index cf35b6e998095..0000000000000 --- a/paddle/fluid/operators/ops_signature/pow2_decay_with_linear_warmup_sig.cc +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature Pow2DecayWithLinearWarmupOpArgumentMapping( - const ArgumentMappingContext& ctx UNUSED) { - return KernelSignature("pow2_decay_with_linear_warmup", - {"LearningRate", "Step"}, - {"warmup_steps", "total_steps", "base_lr", "end_lr"}, - {"LearningRateOut", "StepOut"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(pow2_decay_with_linear_warmup, - phi::Pow2DecayWithLinearWarmupOpArgumentMapping); diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc index 6c64c6a1f72ff..23441206a55c1 100644 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc @@ -36,14 +36,14 @@ class DecayedAdagradOp : public framework::OperatorWithKernel { "DecayedAdagradOp"); PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("Param").front(), framework::proto::VarType::LOD_TENSOR, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input var's type should be phi::DenseTensor, " "but the received is %s", ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front())); PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("Grad").front(), framework::proto::VarType::LOD_TENSOR, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input var's type should be phi::DenseTensor, " "but the received is %s", ctx->Inputs("Grad").front(), @@ -57,26 +57,26 @@ class DecayedAdagradOp : public framework::OperatorWithKernel { auto lr_dims = ctx->GetInputDim("LearningRate"); PADDLE_ENFORCE_NE(common::product(lr_dims), 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Maybe the Input variable LearningRate has not " "been initialized. 
You may need to confirm " "if you put exe.run(startup_program) " "after optimizer.minimize function.")); - PADDLE_ENFORCE_EQ(common::product(lr_dims), - 1, - platform::errors::InvalidArgument( - "LearningRate should have one element")); + PADDLE_ENFORCE_EQ( + common::product(lr_dims), + 1, + phi::errors::InvalidArgument("LearningRate should have one element")); auto param_dims = ctx->GetInputDim("Param"); PADDLE_ENFORCE_EQ( param_dims, ctx->GetInputDim("Grad"), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Param and Grad input of DecayedAdagradOp should have " "the same dimension.")); PADDLE_ENFORCE_EQ( param_dims, ctx->GetInputDim("Moment"), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Param and Moment input of DecayedAdagradOp should have " "the same dimension.")); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index b51d12c003e38..a54aebc3eba5e 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -60,7 +60,7 @@ static void CheckCommContextHasRingId( const distributed::CommContextManager &comm_context_manager, int ring_id) { PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "You choose to use new communication library by " "setting environment " "variable FLAGS_dynamic_static_unified_comm True. " @@ -1773,7 +1773,7 @@ void DistributedFusedLambKernel( comm_context_manager.Get(std::to_string(ring_ids[0]))); PADDLE_ENFORCE_NE(comm_ctx, nullptr, - paddle::platform::errors::Unavailable( + phi::errors::Unavailable( "NCCLCommContext is nullptr, collective op should " "has ring_id attr.")); diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.cc b/paddle/fluid/operators/optimizers/dpsgd_op.cc index d8762b8bd719a..4c5b7bb369ad8 100644 --- a/paddle/fluid/operators/optimizers/dpsgd_op.cc +++ b/paddle/fluid/operators/optimizers/dpsgd_op.cc @@ -22,41 +22,41 @@ class DpsgdOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), - true, - platform::errors::NotFound( - "Input(Param) of DpsgdOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), - true, - platform::errors::NotFound( - "Input(Grad) of DpsgdOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Param"), + true, + phi::errors::NotFound("Input(Param) of DpsgdOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Grad"), + true, + phi::errors::NotFound("Input(Grad) of DpsgdOp should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasInput("LearningRate"), true, - platform::errors::NotFound( + phi::errors::NotFound( "Input(LearningRate) of DpsgdOp should not be null.")); PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("Param").front(), framework::proto::VarType::LOD_TENSOR, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input var's type should be phi::DenseTensor, " "but the received is %s", ctx->GetInputsVarType("Param").front())); PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("Grad").front(), framework::proto::VarType::LOD_TENSOR, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input var's type should be phi::DenseTensor, " "but the received is %s", 
ctx->GetInputsVarType("Grad").front())); PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true, - platform::errors::NotFound( + phi::errors::NotFound( "Output(ParamOut) of DpsgdOp should not be null.")); auto lr_dims = ctx->GetInputDim("LearningRate"); PADDLE_ENFORCE_EQ(common::product(lr_dims), 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Learning rate should have 1 dimension. But Received " "LearningRate's dims [%s].", common::product(lr_dims))); @@ -64,7 +64,7 @@ class DpsgdOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( param_dims, ctx->GetInputDim("Grad"), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Param and Grad input of DpsgdOp should have same dimension. But " "received Para's dim [%s] and Grad's dim [%s].", param_dims, diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.h b/paddle/fluid/operators/optimizers/dpsgd_op.h index 0f2980ff368f4..427dc15f74638 100644 --- a/paddle/fluid/operators/optimizers/dpsgd_op.h +++ b/paddle/fluid/operators/optimizers/dpsgd_op.h @@ -31,7 +31,7 @@ class DpsgdOpKernel : public framework::OpKernel { const auto *param_var = ctx.InputVar("Param"); PADDLE_ENFORCE_EQ(param_var->IsType(), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Var(%s)'s type should be phi::DenseTensor, " "but the received is %s", ctx.InputNames("Param").front(), @@ -40,7 +40,7 @@ class DpsgdOpKernel : public framework::OpKernel { const auto *grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE_EQ(grad_var->IsType(), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Var(%s)'s type should be phi::DenseTensor, " "but the received is %s", ctx.InputNames("Grad").front(), @@ -56,12 +56,12 @@ class DpsgdOpKernel : public framework::OpKernel { auto sz = param_out->numel(); PADDLE_ENFORCE_EQ(param->numel(), sz, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input parameter's number of elements is error, " "expected %zu, but received %zu.")); PADDLE_ENFORCE_EQ(grad->numel(), sz, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input gradient's number of elements is error, " "expected %zu, but received %zu.")); diff --git a/paddle/fluid/operators/optimizers/ftrl_op.cc b/paddle/fluid/operators/optimizers/ftrl_op.cc index e6eadadc17b6c..37edf5b8f8aa8 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.cc +++ b/paddle/fluid/operators/optimizers/ftrl_op.cc @@ -45,7 +45,7 @@ class FTRLOp : public framework::OperatorWithKernel { auto param_dim = ctx->GetInputDim("Param"); PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Grad"), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Two input of FTRL Op's dimension must be same, but " "param_dim is %d, Grad is %d", param_dim, @@ -54,14 +54,14 @@ class FTRLOp : public framework::OperatorWithKernel { auto lr_dim = ctx->GetInputDim("LearningRate"); PADDLE_ENFORCE_NE(common::product(lr_dim), 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Maybe the Input variable LearningRate has not " "been initialized. 
You may need to confirm " "if you put exe.run(startup_program) " "after optimizer.minimize function.")); PADDLE_ENFORCE_EQ(common::product(lr_dim), 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Learning Rate should be a scalar, but got %d", common::product(lr_dim))); diff --git a/paddle/fluid/operators/optimizers/ftrl_op.h b/paddle/fluid/operators/optimizers/ftrl_op.h index d563b84b8d5c6..347dcbafa38d5 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.h +++ b/paddle/fluid/operators/optimizers/ftrl_op.h @@ -221,8 +221,8 @@ class FTRLOpKernel : public framework::OpKernel { lin_accum_out->mutable_data(ctx.GetPlace())); for_range(functor); } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Variable Type of Grad")); + PADDLE_THROW( + phi::errors::InvalidArgument("Unsupported Variable Type of Grad")); } } }; diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc deleted file mode 100644 index 0c5a9721e279b..0000000000000 --- a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -class Pow2DecayWithLinearWarmupOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext *ctx) const override { - auto dim = common::make_ddim({1}); - ctx->SetOutputDim("LearningRateOut", dim); - ctx->SetOutputDim("StepOut", dim); - } - - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - auto data_type = - OperatorWithKernel::IndicateVarDataType(ctx, "LearningRate"); - return phi::KernelKey(data_type, ctx.GetPlace()); - } -}; - -class Pow2DecayWithLinearWarmupOpMaker - : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("LearningRate", "(Tensor) The input learning rate Tensor."); - AddInput("Step", "(Tensor) The input global step Tensor."); - AddOutput("LearningRateOut", - "(Tensor) The output learning rate Tensor. Same with " - "Input(LearningRate)."); - AddOutput( - "StepOut", - "(Tensor) The output learning rate Tensor. Same with Input(Step)."); - AddAttr("warmup_steps", "(int64_t) The warmup steps."); - AddAttr( - "total_steps", - "(int64_t) The total steps for changing the learning rate."); - AddAttr("base_lr", - "(float) The final learning rate value after warmup."); - AddAttr("end_lr", - "(float) The final learning rate value after total_steps."); - AddComment(R"DOC( -The Pow2DecayWithLinearWarmup learning rate scheduler. 
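-As a minimal sketch, assuming only the piecewise rule spelled out below (the
-helper name is hypothetical; this is an illustration, not the registered
-kernel):
-
-#include <cstdint>
-
-// Hypothetical standalone version of the documented schedule.
-double Pow2DecayWithLinearWarmupLR(int64_t step_num, int64_t warmup_steps,
-                                   int64_t total_steps, double base_lr,
-                                   double end_lr) {
-  if (step_num < warmup_steps) {  // linear warmup
-    return base_lr * static_cast<double>(step_num) / warmup_steps;
-  }
-  if (step_num <= total_steps) {  // quadratic (pow2) decay
-    double factor = 1.0 - static_cast<double>(step_num - warmup_steps) /
-                              (total_steps - warmup_steps);
-    return (base_lr - end_lr) * factor * factor + end_lr;
-  }
-  return end_lr;  // flat tail after total_steps
-}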
- -When step_num < warmup_steps, lr = base_lr * step_num / warmup_steps - -When warmup_steps <= step_num <= total_steps, - factor = 1 - (step_num - warmup_steps) / (total_steps - warmup_steps) - lr = (base_lr - end_lr) * factor * factor + end_lr - -When step_num > total_steps, lr = end_lr - -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(pow2_decay_with_linear_warmup, - ops::Pow2DecayWithLinearWarmupOp, - ops::Pow2DecayWithLinearWarmupOpMaker); diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.cc b/paddle/fluid/operators/optimizers/proximal_gd_op.cc index 074cc26c994e3..bc842d03a3c44 100644 --- a/paddle/fluid/operators/optimizers/proximal_gd_op.cc +++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cc @@ -34,7 +34,7 @@ class ProximalGDOp : public framework::OperatorWithKernel { auto param_dim = ctx->GetInputDim("Param"); PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Grad"), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of Intput(Param) should be equal to the " "Input(Grad) of ProximalGD Op. But received " "Input(Param).dimensions=[%s], " @@ -46,7 +46,7 @@ class ProximalGDOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( common::product(lr_dim), 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Learning Rate should be a scalar. But received dimensions:[%s]", lr_dim)); diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index f6d2435590f9e..a489454ff12a9 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -71,7 +71,7 @@ class SGDOpKernel : public framework::OpKernel { const auto* param_var = ctx.InputVar("Param"); PADDLE_ENFORCE_EQ(param_var->IsType(), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Var(%s)'s type should be phi::DenseTensor, " "but the received is %s", ctx.InputNames("Param").front(), @@ -93,7 +93,7 @@ class SGDOpKernel : public framework::OpKernel { ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut"); PADDLE_ENFORCE_EQ(has_master, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(MasterParam) and Output(MasterParamOut) " "should not be null when " "the attr `multi_precision` is true")); @@ -131,7 +131,7 @@ class SGDOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( param, param_out, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input tensor Param of SgdOp should be equal with ParamOut " "if variable's type is SelectedRows.")); auto* grad = ctx.Input("Grad"); @@ -140,7 +140,7 @@ class SGDOpKernel : public framework::OpKernel { auto out_dims = param_out->dims(); PADDLE_ENFORCE_EQ(in_height, out_dims[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input tensor Grad's height of SgdOp should be " "equal with ParamOut's dims. 
But received Grad's " "height [%s] and ParamOut's dims [%s]", @@ -153,7 +153,7 @@ class SGDOpKernel : public framework::OpKernel { int64_t in_row_numel = in_value.numel() / in_rows.size(); PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The in_row_numel of SgdOp should be equal with " "param_out's numel / in_height.")); @@ -179,7 +179,7 @@ class SGDOpKernel : public framework::OpKernel { } else { PADDLE_ENFORCE_EQ(false, true, - platform::errors::PermissionDenied( + phi::errors::PermissionDenied( "Unsupported Variable Type of Grad " "in SgdOp. Excepted LodTensor or " "SelectedRows, But received [%s]", diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h index 66ba8de469fee..ced04109e10bc 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.h +++ b/paddle/fluid/operators/optimizers/sgd_op.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/var_type_traits.h" -#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/kernels/funcs/jit/kernels.h" namespace paddle { @@ -92,7 +92,7 @@ struct sgd_dense_param_kernel -struct sgd_dense_param_kernel::kId> { void operator()(const framework::ExecutionContext &ctx) const { VLOG(4) << "[CPU]: sgd_dense_param_kernel"; @@ -100,12 +100,12 @@ struct sgd_dense_param_kernel("Param"); auto *param_out = ctx.Output("ParamOut"); const auto *grad = ctx.Input("Grad"); - param_out->mutable_data(ctx.GetPlace()); + param_out->mutable_data(ctx.GetPlace()); - auto p = framework::EigenVector::Flatten(*param); - auto g = framework::EigenVector::Flatten(*grad); - auto o = framework::EigenVector::Flatten(*param_out); - const auto *lr = learning_rate->data(); + auto p = framework::EigenVector::Flatten(*param); + auto g = framework::EigenVector::Flatten(*grad); + auto o = framework::EigenVector::Flatten(*param_out); + const auto *lr = learning_rate->data(); o = p - lr[0] * g; } @@ -113,7 +113,7 @@ struct sgd_dense_param_kernel -struct sgd_dense_param_kernel::kId> { void operator()(const framework::ExecutionContext &ctx) const { VLOG(4) << "[CPU]: sgd_dense_param_kernel"; @@ -127,15 +127,15 @@ struct sgd_dense_param_kernel(grad_rows.size()); const auto grad_width = grad_value.numel() / grad_val_height; - const auto *grad_data = grad_value.data(); - auto *out_data = param_out->data(); - const auto *lr = learning_rate->data(); + const auto *grad_data = grad_value.data(); + auto *out_data = param_out->data(); + const auto *lr = learning_rate->data(); for (size_t i = 0; i < grad_rows.size(); ++i) { PADDLE_ENFORCE_LT( grad_rows[i], grad_height, - platform::errors::OutOfRange( + phi::errors::OutOfRange( "Grad rows index value should be less than grad height." "Got [%s], but expected less than [%s]", grad_rows[i], @@ -170,7 +170,7 @@ class SGDOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( false, true, - platform::errors::PermissionDenied( + phi::errors::PermissionDenied( "Unsupported Variable Type of Parameter in SgdOp. 
Excepted " "LodTensor or SelectedRows, But received [%s]", paddle::framework::ToTypeName(param_var->Type()))); @@ -188,22 +188,22 @@ class SGDOpKernel : public framework::OpKernel { const auto sz = param_out->numel(); PADDLE_ENFORCE_EQ(param->numel(), sz, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input tensor Param's numel of SgdOp " "should be equal with ParamOut's numel. " "But received Param's " "numel = [%s], ParamOut's numel = [%s]", param->numel(), sz)); - PADDLE_ENFORCE_EQ(grad->numel(), - sz, - platform::errors::InvalidArgument( - "The input tensor Grad's numel of SgdOp " - "should be equal with ParamOut's numel. " - "But received Grad's " - "numel = [%s], ParamOut's numel = [%s]", - grad->numel(), - sz)); + PADDLE_ENFORCE_EQ( + grad->numel(), + sz, + phi::errors::InvalidArgument("The input tensor Grad's numel of SgdOp " + "should be equal with ParamOut's numel. " + "But received Grad's " + "numel = [%s], ParamOut's numel = [%s]", + grad->numel(), + sz)); dense_param_and_grad_kernel(ctx); } else if (grad_var->IsType()) { @@ -212,7 +212,7 @@ class SGDOpKernel : public framework::OpKernel { // It's better to find a more elegant solution. PADDLE_ENFORCE_EQ(param, param_out, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input tensor Param of SgdOp " "should be equal with ParamOut if variable's " "type is SelectedRows. ")); @@ -228,7 +228,7 @@ class SGDOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( grad->height(), out_dims[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input tensor Grad's height of SgdOp " "should be equal with ParamOut's dims. But received Grad's " "height [%s] and ParamOut's dims [%s]", @@ -246,7 +246,7 @@ class SGDOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( grad_width, param_width, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The grad_value's numel of SgdOp " "should be equal with param_out's numel. But received " "grad_value's numel [%s] and param_out's numel [%s]", @@ -258,7 +258,7 @@ class SGDOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( false, true, - platform::errors::PermissionDenied( + phi::errors::PermissionDenied( "Unsupported Variable Type of Grad in SgdOp. Excepted " "LodTensor or SelectedRows, But received [%s]", paddle::framework::ToTypeName(grad_var->Type()))); @@ -273,7 +273,7 @@ class SGDOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(grad_var->IsType(), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "When param is SelectedRows, gradient should also " "be SelectedRows")); const auto ¶m = param_var->Get(); @@ -291,7 +291,7 @@ class SGDOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( param_row_width, grad_row_width, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The param_row in SgdOP should have the same size with grad_row. " "But received param_row's width is [%s], and grad_row's width is " "[%s]", @@ -306,7 +306,7 @@ class SGDOpKernel : public framework::OpKernel { PADDLE_ENFORCE_GE( id_index, static_cast(0), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The id in SgdOp should be >= 0. 
But received id_index is [%s]", id_index)); for (int64_t j = 0; j < grad_row_width; j++) { diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.cc b/paddle/fluid/operators/optimizers/sparse_momentum_op.cc index c9f9181664e51..7ef426cedad19 100644 --- a/paddle/fluid/operators/optimizers/sparse_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.cc @@ -25,7 +25,7 @@ class SparseMomentumOpInferVarType : public framework::VarTypeInference { auto in_var_type = ctx->GetInputType("Param"); PADDLE_ENFORCE_EQ(in_var_type == framework::proto::VarType::LOD_TENSOR, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only support LodTensor, Unexpected Input Type.")); ctx->SetOutputType("ParamOut", in_var_type, framework::ALL_ELEMENTS); diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.cu b/paddle/fluid/operators/optimizers/sparse_momentum_op.cu index a0df85e1453da..0a98ee4b3e5de 100644 --- a/paddle/fluid/operators/optimizers/sparse_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.cu @@ -14,7 +14,7 @@ #include "paddle/fluid/operators/optimizers/sparse_momentum_op.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/phi/common/float16.h" namespace ops = paddle::operators; namespace plat = paddle::platform; @@ -24,4 +24,4 @@ PD_REGISTER_STRUCT_KERNEL(sparse_momentum, ops::SparseMomentumOpKernel, float, double, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.h b/paddle/fluid/operators/optimizers/sparse_momentum_op.h index 4c47fd2b62178..6f1a9712115af 100644 --- a/paddle/fluid/operators/optimizers/sparse_momentum_op.h +++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.h @@ -21,9 +21,9 @@ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/float16.h" #ifdef __NVCC__ #include "cub/cub.cuh" @@ -154,7 +154,7 @@ class SparseMomentumOp : public framework::OperatorWithKernel { auto lr_dims = common::product(ctx->GetInputDim("LearningRate")); PADDLE_ENFORCE_EQ(lr_dims != 0 && lr_dims == 1, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Learning_rate should be a scalar. But Received " "LearningRate's dim [%s]", lr_dims)); @@ -163,7 +163,7 @@ class SparseMomentumOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( param_dim, ctx->GetInputDim("Velocity"), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Param and Velocity of SparseMomentumOp should have the same " "dimension. 
But received Param's dim [%s] and Velocity [%s].", param_dim, @@ -384,8 +384,8 @@ class SparseMomentumOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( axis == 0 || axis == 1, true, - platform::errors::InvalidArgument("The axis of sparse_momentum_op only " - "support axis=0 or axis=1 now.")); + phi::errors::InvalidArgument("The axis of sparse_momentum_op only " + "support axis=0 or axis=1 now.")); auto learning_rate = ctx.Input("LearningRate"); auto param = ctx.Input("Param"); @@ -400,13 +400,13 @@ class SparseMomentumOpKernel : public framework::OpKernel { PADDLE_ENFORCE_GT( index->dims()[0], 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The index of sparse_momentum_op should not be empty" "when the index's rank is 1.")); } else if (index->dims().size() == 2) { PADDLE_ENFORCE_EQ(index->dims()[1], 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "If the index's rank of sparse_momentum_op is 2," " the second dimension should be 1.")); } @@ -418,7 +418,7 @@ class SparseMomentumOpKernel : public framework::OpKernel { ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut"); PADDLE_ENFORCE_EQ(has_master, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(MasterParam) and Output(MasterParamOut) " "should not be null when " "the attr `multi_precision` is true")); @@ -443,16 +443,16 @@ class SparseMomentumOpKernel : public framework::OpKernel { auto param_dims = param->dims(); auto grad_dims = grad->dims(); - PADDLE_ENFORCE_EQ(param_dims.size(), - 2, - platform::errors::InvalidArgument( - "The Param's rank of sparse_momentum_op" - " must be 2 now.")); - PADDLE_ENFORCE_EQ(grad_dims.size(), - 2, - platform::errors::InvalidArgument( - "The Grad's rank of sparse_momentum_op" - " must be 2 now.")); + PADDLE_ENFORCE_EQ( + param_dims.size(), + 2, + phi::errors::InvalidArgument("The Param's rank of sparse_momentum_op" + " must be 2 now.")); + PADDLE_ENFORCE_EQ( + grad_dims.size(), + 2, + phi::errors::InvalidArgument("The Grad's rank of sparse_momentum_op" + " must be 2 now.")); phi::DenseTensor sorted_index, grad_index, sort_value; auto sorted_index_ptr = @@ -511,7 +511,7 @@ class SparseMomentumOpKernel : public framework::OpKernel { grad_index_ptr[i] = vec_tosort[i].second; } } else { - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(phi::errors::Unimplemented( "sparse_momentum %s is not supported.", ctx.GetPlace())); } diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc deleted file mode 100644 index 6529bbc29fcfe..0000000000000 --- a/paddle/fluid/operators/pad2d_op.cc +++ /dev/null @@ -1,649 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
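On CPU, the sparse_momentum kernel above prepares the update by sorting the index values together with their original positions, so gathered gradient rows can later be matched back to parameter rows (the `vec_tosort` / `grad_index_ptr` code further down in this file). A reduced sketch of just that pairing step, assuming an int64 index and plain vectors in place of DenseTensor:

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Sort index values while remembering where each came from: sorted_index
// holds the values ascending, and grad_index[i] is the original position
// of sorted_index[i].
void SortIndexWithPermutation(const std::vector<int64_t>& index,
                              std::vector<int64_t>* sorted_index,
                              std::vector<int64_t>* grad_index) {
  std::vector<std::pair<int64_t, int64_t>> vec_tosort;
  vec_tosort.reserve(index.size());
  for (int64_t i = 0; i < static_cast<int64_t>(index.size()); ++i) {
    vec_tosort.emplace_back(index[i], i);
  }
  std::sort(vec_tosort.begin(), vec_tosort.end());
  sorted_index->resize(index.size());
  grad_index->resize(index.size());
  for (std::size_t i = 0; i < vec_tosort.size(); ++i) {
    (*sorted_index)[i] = vec_tosort[i].first;
    (*grad_index)[i] = vec_tosort[i].second;
  }
}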
*/ - -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -void Pad2DConstNCHW(const T* in_data, - const int num, - const int channels, - const int in_height, - const int in_width, - const int out_height, - const int out_width, - const int pad_top, - const int pad_left, - T value, - T* out_data) { - for (int n = 0; n < num; ++n) { - for (int c = 0; c < channels; ++c) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - out_data[out_h * out_width + out_w] = - (in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width) - ? value - : in_data[in_h * in_width + in_w]; - } - } - in_data += in_height * in_width; - out_data += out_height * out_width; - } - } -} - -template -void Pad2DConstNHWC(const T* in_data, - const int num, - const int channels, - const int in_height, - const int in_width, - const int out_height, - const int out_width, - const int pad_top, - const int pad_left, - T value, - T* out_data) { - for (int n = 0; n < num; ++n) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - const int out_index = (out_h * out_width + out_w) * channels; - if (in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width) { - for (int c = 0; c < channels; ++c) { - out_data[out_index + c] = value; - } - } else { - const int in_index = (in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - out_data[out_index + c] = in_data[in_index + c]; - } - } - } - } - in_data += in_height * in_width * channels; - out_data += out_height * out_width * channels; - } -} - -template -void Pad2DReflectNCHW(const T* in_data, - const int num, - const int channels, - const int in_height, - const int in_width, - const int out_height, - const int out_width, - const int pad_top, - const int pad_left, - T* out_data) { - for (int n = 0; n < num; ++n) { - for (int c = 0; c < channels; ++c) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - in_h = std::max(in_h, -in_h); // reflect by 0 - in_h = - std::min(in_h, 2 * in_height - in_h - 2); // reflect by in_height - in_w = std::max(in_w, -in_w); // reflect by 0 - in_w = - std::min(in_w, 2 * in_width - in_w - 2); // reflect by in_width - out_data[out_h * out_width + out_w] = in_data[in_h * in_width + in_w]; - } - } - in_data += in_height * in_width; - out_data += out_height * out_width; - } - } -} - -template -void Pad2DReflectNHWC(const T* in_data, - const int num, - const int channels, - const int in_height, - const int in_width, - const int out_height, - const int out_width, - const int pad_top, - const int pad_left, - T* out_data) { - for (int n = 0; n < num; ++n) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - const int out_index = (out_h * out_width + out_w) * channels; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - in_h = std::max(in_h, -in_h); - in_h = std::min(in_h, 2 * in_height - in_h - 2); - in_w = std::max(in_w, -in_w); - in_w = std::min(in_w, 2 * in_width - in_w - 2); - const int in_index = (in_h * in_width + in_w) * channels; - - for (int c = 0; c < channels; ++c) 
{ - out_data[out_index + c] = in_data[in_index + c]; - } - } - } - in_data += in_height * in_width * channels; - out_data += out_height * out_width * channels; - } -} - -template -void Pad2DEdgeNCHW(const T* in_data, - const int num, - const int channels, - const int in_height, - const int in_width, - const int out_height, - const int out_width, - const int pad_top, - const int pad_left, - T* out_data) { - for (int n = 0; n < num; ++n) { - for (int c = 0; c < channels; ++c) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); - int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); - out_data[out_h * out_width + out_w] = in_data[in_h * in_width + in_w]; - } - } - in_data += in_height * in_width; - out_data += out_height * out_width; - } - } -} - -template -void Pad2DEdgeNHWC(const T* in_data, - const int num, - const int channels, - const int in_height, - const int in_width, - const int out_height, - const int out_width, - const int pad_top, - const int pad_left, - T* out_data) { - for (int n = 0; n < num; ++n) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - const int out_index = (out_h * out_width + out_w) * channels; - int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); - int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); - const int in_index = (in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - out_data[out_index + c] = in_data[in_index + c]; - } - } - } - in_data += in_height * in_width * channels; - out_data += out_height * out_width * channels; - } -} - -template -void Pad2DGradConstNCHW(T* d_in_data, - const int num, - const int channels, - const int in_height, - const int in_width, - const int out_height, - const int out_width, - const int pad_top, - const int pad_left, - const T* d_out_data) { - for (int n = 0; n < num; ++n) { - for (int c = 0; c < channels; ++c) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - if (!(in_h < 0 || in_w < 0 || in_h >= in_height || - in_w >= in_width)) { - d_in_data[in_h * in_width + in_w] = - d_out_data[out_h * out_width + out_w]; - } - } - } - d_in_data += in_height * in_width; - d_out_data += out_height * out_width; - } - } -} - -template -void Pad2DGradConstNHWC(T* d_in_data, - const int num, - const int channels, - const int in_height, - const int in_width, - const int out_height, - const int out_width, - const int pad_top, - const int pad_left, - const T* d_out_data) { - for (int n = 0; n < num; ++n) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - const int out_index = (out_h * out_width + out_w) * channels; - if (!(in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width)) { - const int in_index = (in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - d_in_data[in_index + c] = d_out_data[out_index + c]; - } - } - } - } - d_in_data += in_height * in_width * channels; - d_out_data += out_height * out_width * channels; - } -} - -template -void Pad2DGradReflectNCHW(T* d_in_data, - const int num, - const int channels, - const int in_height, - const int in_width, - const int out_height, - const int out_width, - const int pad_top, - const 
int pad_left, - const T* d_out_data) { - for (int n = 0; n < num; ++n) { - for (int c = 0; c < channels; ++c) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - in_h = std::max(in_h, -in_h); // reflect over 0 - in_h = std::min(in_h, - 2 * in_height - in_h - 2); // reflect over in_height - in_w = std::max(in_w, -in_w); // reflect over 0 - in_w = - std::min(in_w, 2 * in_width - in_w - 2); // reflect over in_width - d_in_data[in_h * in_width + in_w] += - d_out_data[out_h * out_width + out_w]; - } - } - d_in_data += in_height * in_width; - d_out_data += out_height * out_width; - } - } -} - -template -void Pad2DGradReflectNHWC(T* d_in_data, - const int num, - const int channels, - const int in_height, - const int in_width, - const int out_height, - const int out_width, - const int pad_top, - const int pad_left, - const T* d_out_data) { - for (int n = 0; n < num; ++n) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - const int out_index = (out_h * out_width + out_w) * channels; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - in_h = std::max(in_h, -in_h); - in_h = std::min(in_h, 2 * in_height - in_h - 2); - in_w = std::max(in_w, -in_w); - in_w = std::min(in_w, 2 * in_width - in_w - 2); - const int in_index = (in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - d_in_data[in_index + c] += d_out_data[out_index + c]; - } - } - } - d_in_data += in_height * in_width * channels; - d_out_data += out_height * out_width * channels; - } -} - -template -void Pad2DGradEdgeNCHW(T* d_in_data, - const int num, - const int channels, - const int in_height, - const int in_width, - const int out_height, - const int out_width, - const int pad_top, - const int pad_left, - const T* d_out_data) { - for (int n = 0; n < num; ++n) { - for (int c = 0; c < channels; ++c) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); - int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); - d_in_data[in_h * in_width + in_w] += - d_out_data[out_h * out_width + out_w]; - } - } - d_in_data += in_height * in_width; - d_out_data += out_height * out_width; - } - } -} - -template -void Pad2DGradEdgeNHWC(T* d_in_data, - const int num, - const int channels, - const int in_height, - const int in_width, - const int out_height, - const int out_width, - const int pad_top, - const int pad_left, - const T* d_out_data) { - for (int n = 0; n < num; ++n) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - const int out_index = (out_h * out_width + out_w) * channels; - int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); - int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); - const int in_index = (in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - d_in_data[in_index + c] += d_out_data[out_index + c]; - } - } - } - d_in_data += in_height * in_width * channels; - d_out_data += out_height * out_width * channels; - } -} - -static inline void GetPaddings(int* paddings, - const framework::ExecutionContext& context) { - auto* paddings_t = context.Input("Paddings"); - if (paddings_t) { - auto paddings_data = paddings_t->data(); - paddings[0] = paddings_data[0]; - paddings[1] = paddings_data[1]; - paddings[2] 
= paddings_data[2]; - paddings[3] = paddings_data[3]; - } else { - auto pads = context.Attr>("paddings"); - std::copy(pads.begin(), pads.end(), paddings); - } -} -class Pad2dOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Pad2d"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Pad2d"); - - auto x_dim = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ(x_dim.size(), - 4, - platform::errors::InvalidArgument( - "The size of Input(X)'s dimension should be equal to " - "4, but received %d. ", - x_dim.size())); - - std::vector out_dims(x_dim.size()); - auto data_format = ctx->Attrs().Get("data_format"); - out_dims[0] = x_dim[0]; - if (ctx->HasInput("Paddings")) { - auto paddings_dim = ctx->GetInputDim("Paddings"); - PADDLE_ENFORCE_EQ(paddings_dim.size(), - 1, - platform::errors::InvalidArgument( - "Size of Input(Paddings)'s dimension should be " - "equal to 1, but received %d.", - paddings_dim.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(paddings_dim[0], - 4, - platform::errors::InvalidArgument( - "Shape of Input(Paddings) should be equal to " - "[4], but received [%d].", - paddings_dim[0])); - } - out_dims[1] = x_dim[1]; - out_dims[2] = x_dim[2]; - out_dims[3] = x_dim[3]; - } else { - auto paddings = ctx->Attrs().Get>("paddings"); - PADDLE_ENFORCE_EQ( - paddings.size(), - 4, - platform::errors::InvalidArgument( - "Size of paddings should be equal to 4, but received %d.", - static_cast(paddings.size()))); - if (data_format == "NCHW") { - out_dims[1] = x_dim[1]; // channel - out_dims[2] = ((!ctx->IsRuntime()) && (x_dim[2] < 0)) - ? x_dim[2] - : (x_dim[2] + paddings[0] + paddings[1]); // height - out_dims[3] = ((!ctx->IsRuntime()) && (x_dim[3] < 0)) - ? x_dim[3] - : (x_dim[3] + paddings[2] + paddings[3]); // width - } else { // NHWC - out_dims[3] = x_dim[3]; // channel - out_dims[1] = ((!ctx->IsRuntime()) && (x_dim[1] < 0)) - ? x_dim[1] - : (x_dim[1] + paddings[0] + paddings[1]); // height - out_dims[2] = ((!ctx->IsRuntime()) && (x_dim[2] < 0)) - ? 
x_dim[2] - : (x_dim[2] + paddings[2] + paddings[3]); // width - } - } - - ctx->SetOutputDim("Out", common::make_ddim(out_dims)); - ctx->ShareLoD("X", /*->*/ "Out"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); -#ifdef PADDLE_WITH_DNNL - // only constant mode and non-blocked layouts are supported for oneDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type) && - ctx.Attr("mode") == "constant" && - ctx.Input("X")->mem_desc().get_inner_nblks() == 0) { - return phi::KernelKey(phi::Backend::ONEDNN, - phi::DataLayout::ONEDNN, - phi::TransToPhiDataType(input_data_type)); - } -#endif - return phi::KernelKey(input_data_type, ctx.GetPlace()); - } - - phi::KernelKey GetKernelTypeForVar( - const std::string& var_name, - const phi::DenseTensor& tensor, - const phi::KernelKey& expected_kernel_type) const override { -#ifdef PADDLE_WITH_DNNL - if ((expected_kernel_type.layout() == phi::DataLayout::ONEDNN) && - (tensor.layout() != phi::DataLayout::ONEDNN)) { - auto attrs = Attrs(); - auto ar = paddle::framework::AttrReader(attrs); - const std::string data_format = ar.Get("data_format"); - return phi::KernelKey(tensor.place(), - common::StringToDataLayout(data_format), - expected_kernel_type.dtype()); - } -#endif - return phi::KernelKey( - tensor.place(), tensor.layout(), expected_kernel_type.dtype()); - } -}; - -class Pad2dOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "The input of pad2d op. " - "The input should be a 4-D tensor with formate NCHW or NHWC."); - AddOutput("Out", - "The output of pad2d op. " - "A tensor with the same shape as X."); - AddInput("Paddings", - "A 1-D tensor to describe the padding rules." - "paddings=[0, 1, 2, 3] means " - "padding 0 row to top, 1 row to bottom, 2 columns to left " - "and 3 columns to right. Size of paddings must be 4.") - .AsDispensable(); - AddAttr>( - "paddings", - "(vector) " - "A list to describe the padding rules." - "paddings=[0, 1, 2, 3] means " - "padding 0 row to top, 1 row to bottom, 2 columns to left " - "and 3 columns to right. Size of paddings must be 4."); - AddAttr("pad_value", - "(float, default 0.0) " - "The value to fill the padded areas in constant mode.") - .SetDefault(0.0f); - AddAttr("mode", - "(float, default constant) " - "Three modes: constant(default), reflect, edge.") - .SetDefault("constant"); - AddAttr( - "data_format", - "(string, default NCHW) Only used in " - "An optional string from: \"NHWC\", \"NCHW\". " - "Defaults to \"NHWC\". Specify the data format of the input data.") - .SetDefault("NCHW"); - AddComment(R"DOC( -Pad2d Operator. -Pad 2-d images according to 'paddings' and 'mode'. -If mode is 'reflect', paddings[0] and paddings[1] must be no greater -than height-1. And the width dimension has the same condition. 
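The reflect and edge modes implemented by the deleted kernels above boil down to a small amount of per-coordinate index arithmetic (the same max/min expressions appear in Pad2DReflectNCHW and Pad2DEdgeNCHW). A sketch of that mapping lifted out of the loops, with hypothetical helper names:

#include <algorithm>

// Map a padded output coordinate back to an input coordinate.
// Reflect mode mirrors across both borders without repeating the edge:
// e.g. for in_size = 3, raw coords -2,-1,0,1,2,3,4 map to 2,1,0,1,2,1,0.
int ReflectIndex(int out_coord, int pad, int in_size) {
  int in_coord = out_coord - pad;
  in_coord = std::max(in_coord, -in_coord);                   // reflect by 0
  in_coord = std::min(in_coord, 2 * in_size - in_coord - 2);  // by in_size
  return in_coord;
}

// Edge mode clamps to the nearest valid coordinate.
int EdgeIndex(int out_coord, int pad, int in_size) {
  return std::min(in_size - 1, std::max(out_coord - pad, 0));
}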
- -Given that X is a channel of image from input: - -X = [[1, 2, 3], - [4, 5, 6]] - -Case 0: - -paddings = [0, 1, 2, 3], -mode = 'constant' -pad_value = 0 - -Out = [[0, 0, 1, 2, 3, 0, 0, 0] - [0, 0, 4, 5, 6, 0, 0, 0] - [0, 0, 0, 0, 0, 0, 0, 0]] - -Case 1: - -paddings = [0, 1, 2, 1], -mode = 'reflect' - -Out = [[3, 2, 1, 2, 3, 2] - [6, 5, 4, 5, 6, 5] - [3, 2, 1, 2, 3, 2]] - -Case 2: - -paddings = [0, 1, 2, 1], -mode = 'edge' - -Out = [[1, 1, 1, 2, 3, 3] - [4, 4, 4, 5, 6, 6] - [4, 4, 4, 5, 6, 6]] -)DOC"); - } -}; - -class Pad2dOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Pad2d@Grad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "Pad2d@Grad"); - - auto x_dims = ctx->GetInputDim("X"); - auto x_grad_name = framework::GradVarName("X"); - if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.GetPlace()); - } -}; - -template -class Pad2dOpGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr bind) const override { - bind->SetInput("X", this->Input("X")); - if (this->HasInput("Paddings")) { - bind->SetInput("Paddings", this->Input("Paddings")); - } - bind->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - bind->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - bind->SetAttrMap(this->Attrs()); - bind->SetType("pad2d_grad"); - } -}; - -// TODO(zjl): Paddings can also be skipped! -DECLARE_NO_NEED_BUFFER_VARS_INFERER(Pad2dOpGradNoNeedBufferVarsInferer, "X"); - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(pad2d, - ops::Pad2dOp, - ops::Pad2dOpMaker, - ops::Pad2dOpGradMaker, - ops::Pad2dOpGradMaker); -REGISTER_OPERATOR(pad2d_grad, - ops::Pad2dOpGrad, - ops::Pad2dOpGradNoNeedBufferVarsInferer); diff --git a/paddle/fluid/operators/partial_concat_op.cc b/paddle/fluid/operators/partial_concat_op.cc index 7fca2eea27c45..518da44d1a08e 100644 --- a/paddle/fluid/operators/partial_concat_op.cc +++ b/paddle/fluid/operators/partial_concat_op.cc @@ -26,26 +26,26 @@ class PartialConcatOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE( ctx->Inputs("X").size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Inputs(X) of Partial ConcatOp should not be empty.")); PADDLE_ENFORCE_EQ( ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(Out) of Partial ConcatOp should not be null.")); auto inputs_dims = ctx->GetInputsDim("X"); PADDLE_ENFORCE_EQ(inputs_dims[0].size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only supports 2-D array with batch size in the 1st " "dimension and data in the 2nd.")); const size_t inputs_num = inputs_dims.size(); PADDLE_ENFORCE_GT(inputs_num, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: Input tensors count should > 0. 
But " "received inputs' length is 0.")); if (inputs_num == 1) { @@ -57,7 +57,7 @@ class PartialConcatOp : public framework::OperatorWithKernel { for (size_t i = 0; i < inputs_num; ++i) { PADDLE_ENFORCE_EQ(inputs_dims[i].size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "It only supports two dimensions input now.")); if (i == 0) { batch_size = inputs_dims[0][0]; @@ -65,11 +65,11 @@ class PartialConcatOp : public framework::OperatorWithKernel { } else { PADDLE_ENFORCE_EQ(inputs_dims[i][0], batch_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The batch size of all inputs must be same")); PADDLE_ENFORCE_EQ(inputs_dims[i][1], input_len, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input length of all inputs must be same")); } } @@ -101,10 +101,10 @@ class PartialConcatOp : public framework::OperatorWithKernel { break; } } - PADDLE_ENFORCE_EQ(flag, - 1, - platform::errors::InvalidArgument( - "All Inputs of PartialSum OP are Empty!")); + PADDLE_ENFORCE_EQ( + flag, + 1, + phi::errors::InvalidArgument("All Inputs of PartialSum OP are Empty!")); return phi::KernelKey(input_data_type, ctx.GetPlace()); } }; @@ -124,7 +124,7 @@ class PartialConcatGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( in_names.size(), out_names.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of arguments in %s[%d] and %s[%d] is not equal.", in_x, in_names.size(), diff --git a/paddle/fluid/operators/partial_concat_op.cu b/paddle/fluid/operators/partial_concat_op.cu index fb746b2944acc..a597cb11f08ff 100644 --- a/paddle/fluid/operators/partial_concat_op.cu +++ b/paddle/fluid/operators/partial_concat_op.cu @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/partial_concat_op.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/phi/common/float16.h" namespace plat = paddle::platform; @@ -73,13 +73,13 @@ class PartialConcatOpCUDAKernel : public framework::OpKernel { phi::DenseTensor *out = ctx.Output("Out"); PADDLE_ENFORCE_EQ(in_vars[0] != nullptr, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input of partial concat should not be null.")); auto input_dim = in_vars[0]->dims(); PADDLE_ENFORCE_EQ(input_dim.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only supports 2-D array with batch size in the 1st " "dimension and data in the 2nd.")); auto in_size = input_dim[1]; @@ -156,7 +156,7 @@ class PartialConcatGradOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(ins[0] != nullptr, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input of partial concat should not be null.")); // all parameters auto batch_size = ins[0]->dims()[0]; @@ -240,7 +240,7 @@ PD_REGISTER_STRUCT_KERNEL(partial_concat, double, int, int64_t, - plat::float16, + phi::dtype::float16, phi::dtype::complex, phi::dtype::complex) {} PD_REGISTER_STRUCT_KERNEL(partial_concat_grad, @@ -251,6 +251,6 @@ PD_REGISTER_STRUCT_KERNEL(partial_concat_grad, double, int, int64_t, - plat::float16, + phi::dtype::float16, phi::dtype::complex, phi::dtype::complex) {} diff --git a/paddle/fluid/operators/partial_concat_op.h b/paddle/fluid/operators/partial_concat_op.h index fb0d17aa97b84..16dca9c8c8050 100644 --- a/paddle/fluid/operators/partial_concat_op.h +++ b/paddle/fluid/operators/partial_concat_op.h @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/operators/utils.h" +#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/strided_memcpy.h" namespace paddle { @@ -28,7 +28,7 @@ static inline int64_t ComputeStartIndex(int64_t start_index, int64_t size) { PADDLE_ENFORCE_EQ( start_index >= -size && start_index < size, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The start_index is expected to be in range of [%d, %d), but got %d", -size, size, @@ -47,13 +47,13 @@ class PartialConcatKernel : public framework::OpKernel { phi::DenseTensor* out = ctx.Output("Out"); PADDLE_ENFORCE_EQ(ins[0] != nullptr, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input of partial concat should not be null.")); auto input_dim = ins[0]->dims(); PADDLE_ENFORCE_EQ(input_dim.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only supports 2-D array with batch size in the 1st " "dimension and data in the 2nd.")); auto in_size = input_dim[1]; @@ -94,7 +94,7 @@ class PartialConcatGradientOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(ins[0] != nullptr, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input of partial concat should not be null.")); // all parameters auto batch_size = ins[0]->dims()[0]; diff --git a/paddle/fluid/operators/partial_sum_op.cc b/paddle/fluid/operators/partial_sum_op.cc index 0ac288069f11a..b0c97b4fcc914 100644 --- a/paddle/fluid/operators/partial_sum_op.cc +++ b/paddle/fluid/operators/partial_sum_op.cc @@ -25,12 +25,12 @@ class PartialSumOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Inputs(X) of PartialSumOp should not be empty.")); PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(Out) of PartialSumOp should not be null.")); auto inputs_dims = ctx->GetInputsDim("X"); @@ -38,7 +38,7 @@ class PartialSumOp : public framework::OperatorWithKernel { const size_t inputs_num = inputs_dims.size(); PADDLE_ENFORCE_GT(inputs_num, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: Input tensors count should > 0. 
But " "received inputs' length is 0.")); if (inputs_num == 1) { @@ -55,7 +55,7 @@ class PartialSumOp : public framework::OperatorWithKernel { for (size_t i = 0; i < inputs_num; ++i) { PADDLE_ENFORCE_EQ(inputs_dims[i].size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only support two dimensions input now.")); if (i == 0) { batch_size = inputs_dims[0][0]; @@ -63,23 +63,23 @@ class PartialSumOp : public framework::OperatorWithKernel { } else { PADDLE_ENFORCE_EQ(inputs_dims[i][0], batch_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The batch size of all inputs must be same")); PADDLE_ENFORCE_EQ(inputs_dims[i][1], input_len, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input len of all inputs must be same")); } } - PADDLE_ENFORCE_GT(input_len, - start_index, - platform::errors::OutOfRange( - "start_index must be less than input len")); + PADDLE_ENFORCE_GT( + input_len, + start_index, + phi::errors::OutOfRange("start_index must be less than input len")); if (length > 0) { PADDLE_ENFORCE_GE( input_len, start_index + length, - platform::errors::OutOfRange( + phi::errors::OutOfRange( "start_index + length is larger than input length")); } @@ -104,10 +104,10 @@ class PartialSumOp : public framework::OperatorWithKernel { } } - PADDLE_ENFORCE_EQ(flag, - 1, - platform::errors::InvalidArgument( - "All Inputs of PartialSum OP are Empty!")); + PADDLE_ENFORCE_EQ( + flag, + 1, + phi::errors::InvalidArgument("All Inputs of PartialSum OP are Empty!")); return phi::KernelKey(input_data_type, platform::CPUPlace()); } }; @@ -127,7 +127,7 @@ class PartialSumGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( in_names.size(), out_names.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of arguments in %s[%d] and %s[%d] is not equal.", in_x, in_names.size(), diff --git a/paddle/fluid/operators/partial_sum_op.cu b/paddle/fluid/operators/partial_sum_op.cu index a38ec4c839469..25758cfde4870 100644 --- a/paddle/fluid/operators/partial_sum_op.cu +++ b/paddle/fluid/operators/partial_sum_op.cu @@ -14,7 +14,7 @@ limitations under the License. 
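Both partial ops in this area operate on a column slice [start_index, start_index + length) of each 2-D input, which is exactly what the range checks above enforce; partial_sum adds the slices elementwise, while partial_concat joins them along axis 1. A minimal sketch of the partial_sum forward on flat buffers (my own simplification; the real kernels work on lists of DenseTensor):

#include <cstdint>
#include <vector>

// Sum the column slice [start, start + length) of every input, each of
// shape [batch_size, input_len], into an output of shape [batch_size, length].
std::vector<double> PartialSumForward(
    const std::vector<std::vector<double>>& inputs,
    int64_t batch_size, int64_t input_len,
    int64_t start, int64_t length) {
  std::vector<double> out(batch_size * length, 0.0);
  for (const auto& in : inputs) {
    for (int64_t b = 0; b < batch_size; ++b) {
      for (int64_t j = 0; j < length; ++j) {
        out[b * length + j] += in[b * input_len + start + j];
      }
    }
  }
  return out;
}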
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/partial_sum_op.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/phi/common/float16.h" namespace plat = paddle::platform; @@ -80,7 +80,7 @@ class PartialSumOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( in_vars[0] != nullptr, true, - platform::errors::InvalidArgument("The input should not be null.")); + phi::errors::InvalidArgument("The input should not be null.")); auto place = ctx.GetPlace(); // GPUPlace only now auto start_index = ctx.Attr("start_index"); @@ -156,7 +156,7 @@ class PartialSumGradOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( ins[0] != nullptr, true, - platform::errors::InvalidArgument("The input should not be null.")); + phi::errors::InvalidArgument("The input should not be null.")); auto start_index = ctx.Attr("start_index"); auto length = ctx.Attr("length"); if (length == -1) { diff --git a/paddle/fluid/operators/partial_sum_op.h b/paddle/fluid/operators/partial_sum_op.h index 1b88eafae77db..f0b55728efbc6 100644 --- a/paddle/fluid/operators/partial_sum_op.h +++ b/paddle/fluid/operators/partial_sum_op.h @@ -30,7 +30,7 @@ class PartialSumKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( ins[0] != nullptr, true, - platform::errors::InvalidArgument("The input should not be null.")); + phi::errors::InvalidArgument("The input should not be null.")); auto place = ctx.GetPlace(); // CPUPlace only now @@ -68,7 +68,7 @@ class PartialSumGradientOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( ins[0] != nullptr, true, - platform::errors::InvalidArgument("The input should not be null.")); + phi::errors::InvalidArgument("The input should not be null.")); auto start_index = ctx.Attr("start_index"); auto length = ctx.Attr("length"); auto batch_size = ins[0]->dims()[0]; diff --git a/paddle/fluid/operators/positive_negative_pair_op.cc b/paddle/fluid/operators/positive_negative_pair_op.cc index 96d8bbaa6f772..2974b38ffb5ba 100644 --- a/paddle/fluid/operators/positive_negative_pair_op.cc +++ b/paddle/fluid/operators/positive_negative_pair_op.cc @@ -49,7 +49,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { ctx->HasInput("AccumulateNegativePair") && ctx->HasInput("AccumulateNeutralPair"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "All optional inputs(AccumulatePositivePair, " "AccumulateNegativePair, AccumulateNeutralPair) of " "PositiveNegativePairOp are required if one of them " @@ -57,21 +57,21 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->GetInputDim("AccumulatePositivePair"), scalar_dim, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Shape of Input(AccumulatePositivePair) should be [1]. Received " "shape of Input(AccumulatePositivePair): [%s].", ctx->GetInputDim("AccumulatePositivePair"))); PADDLE_ENFORCE_EQ( ctx->GetInputDim("AccumulateNegativePair"), scalar_dim, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Shape of Input(AccumulateNegativePair) should be [1]. Received " "shape of Input(AccumulateNegativePair): [%s].", ctx->GetInputDim("AccumulateNegativePair"))); PADDLE_ENFORCE_EQ( ctx->GetInputDim("AccumulateNeutralPair"), scalar_dim, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Shape of Input(AccumulateNeutralPair) should be [1]. 
Received " "shape of Input(AccumulateNeutralPair): [%s].", ctx->GetInputDim("AccumulateNeutralPair"))); @@ -82,13 +82,13 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { auto query_dim = ctx->GetInputDim("QueryID"); PADDLE_ENFORCE_EQ(score_dim.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Score should be a 2-D tensor. Received shape of " "Input(Score): [%s].", score_dim)); PADDLE_ENFORCE_EQ(label_dim.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Label should be a 2-D tensor. Received shape of " "Input(Label): [%s].", label_dim)); @@ -98,7 +98,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( label_dim[0], score_dim[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(Score) and Input(Label) should have the same " "height (batch size). Received: the shape of Input(Score) is " "[%s], while the shape of Input(Label) is [%s]. The first " @@ -109,7 +109,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( label_dim[1], 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The width of Label should be 1, i.e. each item should " "have a scalar label. Received shape of Input(Label) is [%s]. " "The second dimension of it is %d, while the expected is %d.", @@ -120,7 +120,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( query_dim, label_dim, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(QueryID) should have the same shape as Input(Label). " "Received: the shape of Input(QueryID) is [%s], " "while the shape of Input(Label) is [%s].", @@ -131,7 +131,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->GetInputDim("Weight"), label_dim, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(Weight) should have the same shape as Input(Label). " "Received: the shape of Input(Weight) is [%s] while the shape " "of Input(Label) is [%s].", @@ -144,7 +144,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_LT( column, depth, - platform::errors::OutOfRange( + phi::errors::OutOfRange( "Attr(column) should be less than depth(the second " "dimension of Input(Score)). Received Attr(column): %d, while " "depth is %d.", @@ -153,7 +153,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE( column, -depth, - platform::errors::OutOfRange( + phi::errors::OutOfRange( "Attr(column) should be greater than equal to negative " "depth, i.e. the second dimension of Input(Score). " "Received Attr(column): %d, while negative depth is %d.", diff --git a/paddle/fluid/operators/prim_ops/CMakeLists.txt b/paddle/fluid/operators/prim_ops/CMakeLists.txt deleted file mode 100644 index 7a1278219bb6d..0000000000000 --- a/paddle/fluid/operators/prim_ops/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -include(operators) -if(WITH_UNITY_BUILD) - # Load Unity Build rules for operators in paddle/fluid/operators/prim_ops. - include(unity_build_rule.cmake) -endif() -register_operators() diff --git a/paddle/fluid/operators/prim_ops/abs_p_op.cc b/paddle/fluid/operators/prim_ops/abs_p_op.cc deleted file mode 100644 index 87b5243d6afe7..0000000000000 --- a/paddle/fluid/operators/prim_ops/abs_p_op.cc +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
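The shape checks above describe this op's contract: a 2-D Score, a width-1 Label, a QueryID per item, and a column attribute selecting which score column to rank by. For intuition only, the PN-pair statistic such an operator accumulates can be sketched as follows; this is my reading of the metric, not the shipped kernel. Among items sharing a QueryID, a pair with different labels counts as positive when the higher-labeled item also scores higher, negative when the order is inverted, and neutral when the scores tie:

#include <cstddef>
#include <cstdint>
#include <map>
#include <tuple>
#include <vector>

// Count (positive, negative, neutral) pairs per the description above.
// score holds the selected column, one value per item.
std::tuple<double, double, double> PnPairCount(
    const std::vector<double>& score,
    const std::vector<double>& label,
    const std::vector<int64_t>& query) {
  double pos = 0, neg = 0, neu = 0;
  std::map<int64_t, std::vector<std::size_t>> by_query;
  for (std::size_t i = 0; i < query.size(); ++i) {
    by_query[query[i]].push_back(i);
  }
  for (const auto& kv : by_query) {
    const auto& items = kv.second;
    for (std::size_t a = 0; a < items.size(); ++a) {
      for (std::size_t b = a + 1; b < items.size(); ++b) {
        const std::size_t i = items[a], j = items[b];
        if (label[i] == label[j]) continue;  // equal labels form no pair
        const double s = (score[i] - score[j]) * (label[i] - label[j]);
        if (s > 0) pos += 1;
        else if (s < 0) neg += 1;
        else neu += 1;  // tied scores
      }
    }
  }
  return std::make_tuple(pos, neg, neu);
}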
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { -class AbsPrimOp : public framework::OperatorBase { - public: - AbsPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator abs_p should not be executed directly")); - } -}; - -class AbsPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of abs_p op."); - AddOutput("Y", "(Tensor), The output tensor of abs_p op."); - AddComment(R"DOC(Autograd primitive abs_p operator.)DOC"); - } -}; - -class AbsPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - PADDLE_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_var->GetShape()); - } -}; - -class AbsPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Output(ctx, "Y")[0]; - SetType(ctx, y_name, GetType(ctx, x_name)); - SetDataType(ctx, y_name, GetDataType(ctx, x_name)); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(abs_p, - paddle::operators::AbsPrimOp, - paddle::operators::AbsPrimOpMaker, - paddle::operators::AbsPrimOpShapeInference, - paddle::operators::AbsPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/add_p_op.cc b/paddle/fluid/operators/prim_ops/add_p_op.cc deleted file mode 100644 index 7fbbdf136929c..0000000000000 --- a/paddle/fluid/operators/prim_ops/add_p_op.cc +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class VarDesc; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -class AddPrimOp : public framework::OperatorBase { - public: - AddPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator add_p should not be executed directly")); - } -}; - -class AddPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of add_p op."); - AddInput("Y", "(Tensor), The input tensor of add_p op."); - AddOutput("Z", "(Tensor), The output tensor of add_p op."); - AddComment(R"DOC( -Autograd primitive add_p operator. -)DOC"); - } -}; - -class AddPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; - framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; - - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - framework::VarDesc *y_var = PADDLE_GET(framework::VarDesc *, y_var_ptr); - auto x_shape = x_var->GetShape(); - auto y_shape = y_var->GetShape(); - size_t x_rank = x_shape.size(); - size_t y_rank = y_shape.size(); - PADDLE_ENFORCE_EQ(x_rank, - y_rank, - platform::errors::InvalidArgument( - "The dimensions of two input tensor should be same, " - "but get %d and %d", - x_rank, - y_rank)); - for (size_t i = 0; i < x_rank; ++i) { - PADDLE_ENFORCE_EQ( - x_shape[i], - y_shape[i], - platform::errors::InvalidArgument( - "The shape of two input tensor at dimension %d should be same, " - "but get %d and %d", - i, - x_shape[i], - y_shape[i])); - } - - PADDLE_GET(framework::VarDesc *, z_var_ptr)->SetShape(x_shape); - } -}; - -class AddPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Input(ctx, "Y")[0]; - auto z_name = Output(ctx, "Z")[0]; - auto x_type = GetType(ctx, x_name); - auto y_type = GetType(ctx, y_name); - auto x_dtype = GetDataType(ctx, x_name); - auto y_dtype = GetDataType(ctx, y_name); - PADDLE_ENFORCE_EQ(x_type, - y_type, - platform::errors::InvalidArgument( - "The type of two input tensor should be same, " - "but get %d and %d", - x_type, - y_type)); - PADDLE_ENFORCE_EQ(x_dtype, - y_dtype, - platform::errors::InvalidArgument( - "The datatype of two input tensor should be same, " - "but get %d and %d", - x_dtype, - y_dtype)); - - SetType(ctx, z_name, x_type); - SetDataType(ctx, z_name, x_dtype); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(add_p, - paddle::operators::AddPrimOp, - paddle::operators::AddPrimOpMaker, - paddle::operators::AddPrimOpShapeInference, - paddle::operators::AddPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/bernoulli_p_op.cc b/paddle/fluid/operators/prim_ops/bernoulli_p_op.cc deleted file mode 
100644 index 251cd9bff5400..0000000000000 --- a/paddle/fluid/operators/prim_ops/bernoulli_p_op.cc +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class VarDesc; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -class BernoulliPrimOp : public framework::OperatorBase { - public: - BernoulliPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator bernoulli_p should not be executed directly")); - } -}; - -class BernoulliPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddOutput("Y", "(Tensor), The output tensor of bernoulli_p op."); - AddAttr>( - "shape", "(std::vector) The shape of output tensor."); - AddAttr("dtype", "(int) The dtype of output tensor."); - AddAttr("p", "(float) The probability of bernoulli distribution."); - AddComment(R"DOC( -Autograd primitive bernoulli_p operator. -)DOC"); - } -}; - -class BernoulliPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; - auto shape = ctx->Attrs().Get>("shape"); - PADDLE_GET(framework::VarDesc *, y_var_ptr)->SetShape(shape); - } -}; - -class BernoulliPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto y_name = Output(ctx, "Y")[0]; - auto data_type = static_cast( - PADDLE_GET_CONST(int, ctx->GetAttr("dtype"))); - SetDataType(ctx, y_name, data_type); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(bernoulli_p, - paddle::operators::BernoulliPrimOp, - paddle::operators::BernoulliPrimOpMaker, - paddle::operators::BernoulliPrimOpShapeInference, - paddle::operators::BernoulliPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/broadcast_p_op.cc b/paddle/fluid/operators/prim_ops/broadcast_p_op.cc deleted file mode 100644 index d2c391f7a9bc6..0000000000000 --- a/paddle/fluid/operators/prim_ops/broadcast_p_op.cc +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class VarDesc; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -class BroadcastPrimOp : public framework::OperatorBase { - public: - BroadcastPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator broadcast_p should not be executed directly")); - } -}; - -class BroadcastPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of broadcast_p op."); - AddOutput("Y", "(Tensor), The output tensor of broadcast_p op."); - AddAttr>( - "shape", - "(std::vector) Target shape of broadcast_p operator."); - AddComment(R"DOC( -Autograd primitive broadcast_p operator. -)DOC"); - } -}; - -static void CheckShapeValid(const std::vector &x_shape, - const std::vector &target_shape) { - size_t x_rank = x_shape.size(); - size_t target_rank = target_shape.size(); - PADDLE_ENFORCE_GE(target_rank, - x_rank, - platform::errors::InvalidArgument( - "The rank of target shape should be greater than or " - "equal to input tensor's dimensions, " - "but received %d and %d", - target_rank, - x_rank)); - std::vector::const_iterator it = target_shape.begin(); - for (size_t i = 0; i < x_rank; i++, it++) { - if (x_shape[i] != 1) { - it = std::find(it, target_shape.end(), x_shape[i]); - } - PADDLE_ENFORCE_EQ( - it != target_shape.end(), - true, - platform::errors::InvalidArgument( - "Invalid shape, can not broadcast input tensor into target shape," - "the first dismatching shape %d is shape of input tensor at " - "dimension %d", - x_shape[i], - i)); - } -} - -class BroadcastPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - auto x_shape = x_var->GetShape(); - auto target_shape = ctx->Attrs().Get>("shape"); - CheckShapeValid(x_shape, target_shape); - PADDLE_GET(framework::VarDesc *, y_var_ptr)->SetShape(target_shape); - } -}; - -class BroadcastPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Output(ctx, "Y")[0]; - SetType(ctx, y_name, GetType(ctx, x_name)); - SetDataType(ctx, y_name, GetDataType(ctx, x_name)); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(broadcast_p, - paddle::operators::BroadcastPrimOp, 
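The deleted CheckShapeValid above scans the target shape left to right, skipping size-1 input dimensions, so it accepts broadcasting [3, 1] into [2, 3, 5] but rejects [3, 1] into [2, 4, 5]. A self-contained copy of the rule with those two cases as a driver (illustrative only; the original raises via PADDLE_ENFORCE instead of returning a bool):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Standalone restatement of the broadcast_p compatibility rule: every non-1
// input dimension must appear, in order, somewhere in the target shape.
bool BroadcastShapeValid(const std::vector<int64_t>& x_shape,
                         const std::vector<int64_t>& target_shape) {
  if (target_shape.size() < x_shape.size()) return false;
  auto it = target_shape.begin();
  for (std::size_t i = 0; i < x_shape.size(); ++i, ++it) {
    if (x_shape[i] != 1) {
      it = std::find(it, target_shape.end(), x_shape[i]);
    }
    if (it == target_shape.end()) return false;
  }
  return true;
}

int main() {
  std::cout << BroadcastShapeValid({3, 1}, {2, 3, 5}) << "\n";  // 1: accepted
  std::cout << BroadcastShapeValid({3, 1}, {2, 4, 5}) << "\n";  // 0: rejected
}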
- paddle::operators::BroadcastPrimOpMaker, - paddle::operators::BroadcastPrimOpShapeInference, - paddle::operators::BroadcastPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/cast_p_op.cc b/paddle/fluid/operators/prim_ops/cast_p_op.cc deleted file mode 100644 index ead6cc53ceea7..0000000000000 --- a/paddle/fluid/operators/prim_ops/cast_p_op.cc +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class VarDesc; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -class CastPrimOp : public framework::OperatorBase { - public: - CastPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator cast_p should not be executed directly")); - } -}; - -class CastPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of cast_p op."); - AddOutput("Y", "(Tensor), The output tensor of cast_p op."); - AddAttr<int>("dtype", "output data type"); - AddComment(R"DOC(Autograd primitive cast_p operator.)DOC"); - } -}; - -class CastPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - PADDLE_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_var->GetShape()); - } -}; - -class CastPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto out_type = static_cast<framework::proto::VarType::Type>( - PADDLE_GET_CONST(int, ctx->GetAttr("dtype"))); - ctx->SetOutputDataType("Y", out_type); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(cast_p, - paddle::operators::CastPrimOp, - paddle::operators::CastPrimOpMaker, - paddle::operators::CastPrimOpShapeInference, - paddle::operators::CastPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/concat_p_op.cc b/paddle/fluid/operators/prim_ops/concat_p_op.cc deleted file mode 100644 index 6b8d6c0a3322a..0000000000000 --- a/paddle/fluid/operators/prim_ops/concat_p_op.cc +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class VarDesc; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -class ConcatPrimOp : public framework::OperatorBase { - public: - ConcatPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator concat_p should not be executed directly")); - } -}; - -class ConcatPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("XS", "(Tensor), The input tensors of concat_p op.") - .AsDuplicable(); - AddOutput("Y", "(Tensor), The output tensor of concat_p op."); - AddAttr<int64_t>("axis", "(int64_t), The axis along which to concat."); - AddComment(R"DOC( -Autograd primitive concat_p operator. -)DOC"); - } -}; - -class ConcatPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - auto x_var_ptrs = ctx->GetInputVarPtrs("XS"); - framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; - auto axis = ctx->Attrs().Get<int64_t>("axis"); - int64_t cnt_along_axis = 0; - framework::VarDesc *first_x_var = - PADDLE_GET(framework::VarDesc *, x_var_ptrs[0]); - auto first_x_shape = first_x_var->GetShape(); - cnt_along_axis += first_x_shape[axis]; - size_t first_x_rank = first_x_shape.size(); - for (size_t i = 1; i < x_var_ptrs.size(); ++i) { - framework::VarDesc *x_var = - PADDLE_GET(framework::VarDesc *, x_var_ptrs[i]); - auto x_shape = x_var->GetShape(); - cnt_along_axis += x_shape[axis]; - size_t x_rank = x_shape.size(); - PADDLE_ENFORCE_EQ( - x_rank, - first_x_rank, - platform::errors::InvalidArgument("The dimensions of %d input tensor " - "should be same as the dimensions " - "of 1st input tensor's, " - "but get %d and %d", - i + 1, - x_rank, - first_x_rank)); - for (size_t j = 0; j < x_rank; ++j) { - if (j != size_t(axis)) { - PADDLE_ENFORCE_EQ(x_shape[j], - first_x_shape[j], - platform::errors::InvalidArgument( - "The shape of %d input tensor at dimension %d " - "should be same as the 1st input tensor's, " - "but get %d and %d", - i + 1, - j, - x_shape[j], - first_x_shape[j])); - } - } - } - - std::vector<int64_t> y_shape(first_x_shape); - y_shape[axis] = cnt_along_axis; - PADDLE_GET(framework::VarDesc *, y_var_ptr)->SetShape(y_shape); - } -}; - -class ConcatPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_names = Input(ctx, "XS"); - auto y_name = Output(ctx, "Y")[0]; - auto first_x_name =
x_names[0]; - auto first_x_type = GetType(ctx, first_x_name); - auto first_x_dtype = GetDataType(ctx, first_x_name); - for (size_t i = 1; i < x_names.size(); ++i) { - auto x_name = x_names[i]; - auto x_type = GetType(ctx, x_name); - auto x_dtype = GetDataType(ctx, x_name); - PADDLE_ENFORCE_EQ(x_type, - first_x_type, - platform::errors::InvalidArgument( - "The type of %d input tensor should be same as the " - "first input tensor's, " - "but get %d and %d", - i + 1, - x_type, - first_x_type)); - PADDLE_ENFORCE_EQ(x_dtype, - first_x_dtype, - platform::errors::InvalidArgument( - "The datatype of %d input tensor should be same as " - "the first input tensor's, " - "but get %d and %d", - i + 1, - x_dtype, - first_x_dtype)); - } - SetType(ctx, y_name, GetType(ctx, first_x_name)); - SetDataType(ctx, y_name, GetDataType(ctx, first_x_name)); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(concat_p, - paddle::operators::ConcatPrimOp, - paddle::operators::ConcatPrimOpMaker, - paddle::operators::ConcatPrimOpShapeInference, - paddle::operators::ConcatPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/cos_p_op.cc b/paddle/fluid/operators/prim_ops/cos_p_op.cc deleted file mode 100644 index c8acc30ba6107..0000000000000 --- a/paddle/fluid/operators/prim_ops/cos_p_op.cc +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { -class CosPrimOp : public framework::OperatorBase { - public: - CosPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator cos_p should not be executed directly")); - } -}; - -class CosPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of cos_p op."); - AddOutput("Y", "(Tensor), The output tensor of cos_p op."); - AddComment(R"DOC(Autograd primitive cos_p operator.)DOC"); - } -}; - -class CosPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - PADDLE_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_var->GetShape()); - } -}; - -class CosPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Output(ctx, "Y")[0]; - SetType(ctx, y_name, GetType(ctx, x_name)); - SetDataType(ctx, y_name, GetDataType(ctx, x_name)); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(cos_p, - paddle::operators::CosPrimOp, - paddle::operators::CosPrimOpMaker, - paddle::operators::CosPrimOpShapeInference, - paddle::operators::CosPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/div_p_op.cc b/paddle/fluid/operators/prim_ops/div_p_op.cc deleted file mode 100644 index c046c63b8abad..0000000000000 --- a/paddle/fluid/operators/prim_ops/div_p_op.cc +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class VarDesc; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -class DivPrimOp : public framework::OperatorBase { - public: - DivPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator div_p should not be executed directly")); - } -}; - -class DivPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of div_p op."); - AddInput("Y", "(Tensor), The input tensor of div_p op."); - AddOutput("Z", "(Tensor), The output tensor of div_p op."); - AddComment(R"DOC( -Autograd primitive div_p operator. -)DOC"); - } -}; - -class DivPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; - framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; - - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - framework::VarDesc *y_var = PADDLE_GET(framework::VarDesc *, y_var_ptr); - auto x_shape = x_var->GetShape(); - auto y_shape = y_var->GetShape(); - size_t x_rank = x_shape.size(); - size_t y_rank = y_shape.size(); - PADDLE_ENFORCE_EQ(x_rank, - y_rank, - platform::errors::InvalidArgument( - "The dimensions of two input tensor should be same, " - "but get %d and %d", - x_rank, - y_rank)); - for (size_t i = 0; i < x_rank; ++i) { - PADDLE_ENFORCE_EQ( - x_shape[i], - y_shape[i], - platform::errors::InvalidArgument( - "The shape of two input tensor at dimension %d should be same, " - "but get %d and %d", - i, - x_shape[i], - y_shape[i])); - } - - PADDLE_GET(framework::VarDesc *, z_var_ptr)->SetShape(x_shape); - } -}; - -class DivPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Input(ctx, "Y")[0]; - auto z_name = Output(ctx, "Z")[0]; - auto x_type = GetType(ctx, x_name); - auto y_type = GetType(ctx, y_name); - auto x_dtype = GetDataType(ctx, x_name); - auto y_dtype = GetDataType(ctx, y_name); - PADDLE_ENFORCE_EQ(x_type, - y_type, - platform::errors::InvalidArgument( - "The type of two input tensor should be same, " - "but get %d and %d", - x_type, - y_type)); - PADDLE_ENFORCE_EQ(x_dtype, - y_dtype, - platform::errors::InvalidArgument( - "The datatype of two input tensor should be same, " - "but get %d and %d", - x_dtype, - y_dtype)); - - SetType(ctx, z_name, x_type); - SetDataType(ctx, z_name, x_dtype); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(div_p, - paddle::operators::DivPrimOp, - paddle::operators::DivPrimOpMaker, - paddle::operators::DivPrimOpShapeInference, - paddle::operators::DivPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/eq_p_op.cc b/paddle/fluid/operators/prim_ops/eq_p_op.cc deleted file mode 100644 index 
389fd548677d6..0000000000000 --- a/paddle/fluid/operators/prim_ops/eq_p_op.cc +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { -class EqPrimOp : public framework::OperatorBase { - public: - EqPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator eq_p should not be executed directly")); - } -}; - -class EqPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of eq_p op."); - AddInput("Y", "(Tensor), The input tensor of eq_p op."); - AddOutput("Z", "(Tensor), The output tensor of eq_p op."); - AddComment(R"DOC( -Autograd primitive eq_p operator. -)DOC"); - } -}; - -class EqPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; - framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; - - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - framework::VarDesc *y_var = PADDLE_GET(framework::VarDesc *, y_var_ptr); - auto x_shape = x_var->GetShape(); - auto y_shape = y_var->GetShape(); - size_t x_rank = x_shape.size(); - size_t y_rank = y_shape.size(); - PADDLE_ENFORCE_EQ(x_rank, - y_rank, - platform::errors::InvalidArgument( - "The dimensions of two input tensor should be same, " - "but get %d and %d", - x_rank, - y_rank)); - for (size_t i = 0; i < x_rank; ++i) { - PADDLE_ENFORCE_EQ( - x_shape[i], - y_shape[i], - platform::errors::InvalidArgument( - "The shape of two input tensor at dimension %d should be same, " - "but get %d and %d", - i, - x_shape[i], - y_shape[i])); - } - - PADDLE_GET(framework::VarDesc *, z_var_ptr)->SetShape(x_shape); - } -}; - -class EqPrimOpVarTypeInference : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Input(ctx, "Y")[0]; - auto z_name = Output(ctx, "Z")[0]; - auto x_type = GetType(ctx, x_name); - auto y_type = GetType(ctx, y_name); - auto x_dtype = GetDataType(ctx, x_name); - auto y_dtype = GetDataType(ctx, y_name); - PADDLE_ENFORCE_EQ(x_type, - y_type, - platform::errors::InvalidArgument( - "The type of two input tensor should be same, " - "but get %d and %d", - x_type, - y_type)); - PADDLE_ENFORCE_EQ(x_dtype, - y_dtype, - 
platform::errors::InvalidArgument( - "The datatype of two input tensor should be same, " - "but get %d and %d", - x_dtype, - y_dtype)); - - SetType(ctx, z_name, x_type); - SetDataType(ctx, z_name, framework::proto::VarType::BOOL); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(eq_p, - paddle::operators::EqPrimOp, - paddle::operators::EqPrimOpMaker, - paddle::operators::EqPrimOpShapeInference, - paddle::operators::EqPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/erf_p_op.cc b/paddle/fluid/operators/prim_ops/erf_p_op.cc deleted file mode 100644 index 95bbeadfd6798..0000000000000 --- a/paddle/fluid/operators/prim_ops/erf_p_op.cc +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class VarDesc; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -class ErfPrimOp : public framework::OperatorBase { - public: - ErfPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator erf_p should not be executed directly")); - } -}; - -class ErfPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of erf_p op."); - AddOutput("Y", "(Tensor), The output tensor of erf_p op."); - AddComment(R"DOC(Autograd primitive erf_p operator.)DOC"); - } -}; - -class ErfPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - PADDLE_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_var->GetShape()); - } -}; - -class ErfPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Output(ctx, "Y")[0]; - SetType(ctx, y_name, GetType(ctx, x_name)); - SetDataType(ctx, y_name, GetDataType(ctx, x_name)); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(erf_p, - paddle::operators::ErfPrimOp, - paddle::operators::ErfPrimOpMaker, - paddle::operators::ErfPrimOpShapeInference, - paddle::operators::ErfPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/exp_p_op.cc 
b/paddle/fluid/operators/prim_ops/exp_p_op.cc deleted file mode 100644 index 220ed7672ab25..0000000000000 --- a/paddle/fluid/operators/prim_ops/exp_p_op.cc +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { -class ExpPrimOp : public framework::OperatorBase { - public: - ExpPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator exp_p should not be executed directly")); - } -}; - -class ExpPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of exp_p op."); - AddOutput("Y", "(Tensor), The output tensor of exp_p op."); - AddComment(R"DOC(Autograd primitive exp_p operator.)DOC"); - } -}; - -class ExpPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - PADDLE_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_var->GetShape()); - } -}; - -class ExpPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Output(ctx, "Y")[0]; - SetType(ctx, y_name, GetType(ctx, x_name)); - SetDataType(ctx, y_name, GetDataType(ctx, x_name)); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(exp_p, - paddle::operators::ExpPrimOp, - paddle::operators::ExpPrimOpMaker, - paddle::operators::ExpPrimOpShapeInference, - paddle::operators::ExpPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/fill_constant_p_op.cc b/paddle/fluid/operators/prim_ops/fill_constant_p_op.cc deleted file mode 100644 index a570ccd1cecba..0000000000000 --- a/paddle/fluid/operators/prim_ops/fill_constant_p_op.cc +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class VarDesc; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -class FillConstantPrimOp : public framework::OperatorBase { - public: - FillConstantPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator fill_constant_p should not be executed directly")); - } -}; - -class FillConstantPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddOutput("Y", "(Tensor), The output tensor of fill_constant_p op."); - AddAttr<float>("value", "(float) The value of output tensor."); - AddAttr<std::vector<int64_t>>( - "shape", "(std::vector<int64_t>) The shape of output tensor."); - AddAttr<int>("dtype", "(int) The dtype of output tensor."); - AddComment(R"DOC( -Autograd primitive fill_constant_p operator. -)DOC"); - } -}; - -class FillConstantPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; - auto shape = ctx->Attrs().Get<std::vector<int64_t>>("shape"); - PADDLE_GET(framework::VarDesc *, y_var_ptr)->SetShape(shape); - } -}; - -class FillConstantPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto y_name = Output(ctx, "Y")[0]; - auto data_type = static_cast<framework::proto::VarType::Type>( - PADDLE_GET_CONST(int, ctx->GetAttr("dtype"))); - SetDataType(ctx, y_name, data_type); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(fill_constant_p, - paddle::operators::FillConstantPrimOp, - paddle::operators::FillConstantPrimOpMaker, - paddle::operators::FillConstantPrimOpShapeInference, - paddle::operators::FillConstantPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/gather_p_op.cc b/paddle/fluid/operators/prim_ops/gather_p_op.cc deleted file mode 100644 index 23d8349f22eee..0000000000000 --- a/paddle/fluid/operators/prim_ops/gather_p_op.cc +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
- -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class VarDesc; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -class GatherPrimOp : public framework::OperatorBase { - public: - GatherPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator gather_p should not be executed directly")); - } -}; - -class GatherPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of gather_p op."); - AddInput("IndexTensor", - "(Tensor), The index tensor of gather_p op, which is a 1D tensor.") - .AsDispensable(); - AddOutput("Y", "(Tensor), The output tensor of gather_p op."); - AddAttr<int64_t>("axis", "(int64_t), The axis along which to gather."); - AddAttr<std::vector<int64_t>>( - "index", "(std::vector<int64_t>) The index of gather_p op") - .SetDefault({0}); - AddComment(R"DOC( -Autograd primitive gather_p operator. -)DOC"); - } -}; - -class GatherPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; - int64_t num_index = 0; - if (ctx->HasInput("IndexTensor")) { - framework::InferShapeVarPtr index_var_ptr = - ctx->GetInputVarPtrs("IndexTensor")[0]; - framework::VarDesc *index_var = - PADDLE_GET(framework::VarDesc *, index_var_ptr); - auto index_shape = index_var->GetShape(); - PADDLE_ENFORCE_EQ(index_shape.size(), - 1, - platform::errors::InvalidArgument( - "The index tensor should be a 1D tensor," - "but get rank %d", - index_shape.size())); - num_index = index_shape[0]; - } else { - num_index = static_cast<int64_t>( - ctx->Attrs().Get<std::vector<int64_t>>("index").size()); - } - auto axis = ctx->Attrs().Get<int64_t>("axis"); - - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - auto x_shape = x_var->GetShape(); - x_shape[axis] = num_index; - - PADDLE_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_shape); - } -}; - -class GatherPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Output(ctx, "Y")[0]; - if (ctx->HasInput("IndexTensor")) { - auto index_name = Input(ctx, "IndexTensor")[0]; - auto index_dtype = GetDataType(ctx, index_name); - PADDLE_ENFORCE_EQ( - index_dtype, - framework::proto::VarType_Type_INT32, - platform::errors::InvalidArgument( - "The datatype of input tensor should be VarType_Type_INT32(%d), " - "but get %d", - framework::proto::VarType_Type_INT32, - index_dtype)); - } - SetType(ctx, y_name, GetType(ctx, x_name)); - SetDataType(ctx, y_name, GetDataType(ctx, x_name)); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(gather_p, - paddle::operators::GatherPrimOp, - paddle::operators::GatherPrimOpMaker, - paddle::operators::GatherPrimOpShapeInference, - paddle::operators::GatherPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/ge_p_op.cc
b/paddle/fluid/operators/prim_ops/ge_p_op.cc deleted file mode 100644 index 20a6496158611..0000000000000 --- a/paddle/fluid/operators/prim_ops/ge_p_op.cc +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { -class GePrimOp : public framework::OperatorBase { - public: - GePrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator ge_p should not be executed directly")); - } -}; - -class GePrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of ge_p op."); - AddInput("Y", "(Tensor), The input tensor of ge_p op."); - AddOutput("Z", "(Tensor), The output tensor of ge_p op."); - AddComment(R"DOC( -Autograd primitive ge_p operator. 
-)DOC"); - } -}; - -class GePrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; - framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; - - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - framework::VarDesc *y_var = PADDLE_GET(framework::VarDesc *, y_var_ptr); - auto x_shape = x_var->GetShape(); - auto y_shape = y_var->GetShape(); - size_t x_rank = x_shape.size(); - size_t y_rank = y_shape.size(); - PADDLE_ENFORCE_EQ(x_rank, - y_rank, - platform::errors::InvalidArgument( - "The dimensions of two input tensor should be same, " - "but get %d and %d", - x_rank, - y_rank)); - for (size_t i = 0; i < x_rank; ++i) { - PADDLE_ENFORCE_EQ( - x_shape[i], - y_shape[i], - platform::errors::InvalidArgument( - "The shape of two input tensor at dimension %d should be same, " - "but get %d and %d", - i, - x_shape[i], - y_shape[i])); - } - - PADDLE_GET(framework::VarDesc *, z_var_ptr)->SetShape(x_shape); - } -}; - -class GePrimOpVarTypeInference : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Input(ctx, "Y")[0]; - auto z_name = Output(ctx, "Z")[0]; - auto x_type = GetType(ctx, x_name); - auto y_type = GetType(ctx, y_name); - auto x_dtype = GetDataType(ctx, x_name); - auto y_dtype = GetDataType(ctx, y_name); - PADDLE_ENFORCE_EQ(x_type, - y_type, - platform::errors::InvalidArgument( - "The type of two input tensor should be same, " - "but get %d and %d", - x_type, - y_type)); - PADDLE_ENFORCE_EQ(x_dtype, - y_dtype, - platform::errors::InvalidArgument( - "The datatype of two input tensor should be same, " - "but get %d and %d", - x_dtype, - y_dtype)); - - SetType(ctx, z_name, x_type); - SetDataType(ctx, z_name, framework::proto::VarType::BOOL); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(ge_p, - paddle::operators::GePrimOp, - paddle::operators::GePrimOpMaker, - paddle::operators::GePrimOpShapeInference, - paddle::operators::GePrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/gt_p_op.cc b/paddle/fluid/operators/prim_ops/gt_p_op.cc deleted file mode 100644 index 01e8c1612cc43..0000000000000 --- a/paddle/fluid/operators/prim_ops/gt_p_op.cc +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { -class GtPrimOp : public framework::OperatorBase { - public: - GtPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator gt_p should not be executed directly")); - } -}; - -class GtPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of gt_p op."); - AddInput("Y", "(Tensor), The input tensor of gt_p op."); - AddOutput("Z", "(Tensor), The output tensor of gt_p op."); - AddComment(R"DOC( -Autograd primitive gt_p operator. -)DOC"); - } -}; - -class GtPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; - framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; - - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - framework::VarDesc *y_var = PADDLE_GET(framework::VarDesc *, y_var_ptr); - auto x_shape = x_var->GetShape(); - auto y_shape = y_var->GetShape(); - size_t x_rank = x_shape.size(); - size_t y_rank = y_shape.size(); - PADDLE_ENFORCE_EQ(x_rank, - y_rank, - platform::errors::InvalidArgument( - "The dimensions of two input tensor should be same, " - "but get %d and %d", - x_rank, - y_rank)); - for (size_t i = 0; i < x_rank; ++i) { - PADDLE_ENFORCE_EQ( - x_shape[i], - y_shape[i], - platform::errors::InvalidArgument( - "The shape of two input tensor at dimension %d should be same, " - "but get %d and %d", - i, - x_shape[i], - y_shape[i])); - } - - PADDLE_GET(framework::VarDesc *, z_var_ptr)->SetShape(x_shape); - } -}; - -class GtPrimOpVarTypeInference : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Input(ctx, "Y")[0]; - auto z_name = Output(ctx, "Z")[0]; - auto x_type = GetType(ctx, x_name); - auto y_type = GetType(ctx, y_name); - auto x_dtype = GetDataType(ctx, x_name); - auto y_dtype = GetDataType(ctx, y_name); - PADDLE_ENFORCE_EQ(x_type, - y_type, - platform::errors::InvalidArgument( - "The type of two input tensor should be same, " - "but get %d and %d", - x_type, - y_type)); - PADDLE_ENFORCE_EQ(x_dtype, - y_dtype, - platform::errors::InvalidArgument( - "The datatype of two input tensor should be same, " - "but get %d and %d", - x_dtype, - y_dtype)); - - SetType(ctx, z_name, x_type); - SetDataType(ctx, z_name, framework::proto::VarType::BOOL); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(gt_p, - paddle::operators::GtPrimOp, - paddle::operators::GtPrimOpMaker, - paddle::operators::GtPrimOpShapeInference, - paddle::operators::GtPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/log_p_op.cc b/paddle/fluid/operators/prim_ops/log_p_op.cc deleted file mode 100644 index d077510fd5c46..0000000000000 --- a/paddle/fluid/operators/prim_ops/log_p_op.cc +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (c) 2022 
PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { -class LogPrimOp : public framework::OperatorBase { - public: - LogPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator log_p should not be executed directly")); - } -}; - -class LogPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of log_p op."); - AddOutput("Y", "(Tensor), The output tensor of log_p op."); - AddComment(R"DOC( -Autograd primitive log_p operator. -)DOC"); - } -}; - -class LogPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; - - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - - PADDLE_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_var->GetShape()); - } -}; - -class LogPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Output(ctx, "Y")[0]; - SetType(ctx, y_name, GetType(ctx, x_name)); - SetDataType(ctx, y_name, GetDataType(ctx, x_name)); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(log_p, - paddle::operators::LogPrimOp, - paddle::operators::LogPrimOpMaker, - paddle::operators::LogPrimOpShapeInference, - paddle::operators::LogPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/matmul_p_op.cc b/paddle/fluid/operators/prim_ops/matmul_p_op.cc deleted file mode 100644 index 6a53dda16f71c..0000000000000 --- a/paddle/fluid/operators/prim_ops/matmul_p_op.cc +++ /dev/null @@ -1,150 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class VarDesc; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -class MatmulPrimOp : public framework::OperatorBase { - public: - MatmulPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator matmul_p should not be executed directly")); - } -}; - -class MatmulPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of matmul_p op."); - AddInput("Y", "(Tensor), The input tensor of matmul_p op."); - AddOutput("Z", "(Tensor), The output tensor of matmul_p op."); - AddComment(R"DOC( -Autograd primitive matmul_p operator. -)DOC"); - } -}; - -class MatmulPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; - framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; - - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - framework::VarDesc *y_var = PADDLE_GET(framework::VarDesc *, y_var_ptr); - auto x_shape = x_var->GetShape(); - auto y_shape = y_var->GetShape(); - size_t x_rank = x_shape.size(); - size_t y_rank = y_shape.size(); - PADDLE_ENFORCE_EQ(x_rank, - y_rank, - platform::errors::InvalidArgument( - "The two input tensor's dimension should be equal" - "But received first input tensor's dimension is %d, " - "and another input tensor's dimension is %d", - x_rank, - y_rank)); - - PADDLE_ENFORCE_EQ(x_rank == 2 || x_rank == 3, - true, - platform::errors::InvalidArgument( - "The input tensor's dimension should be 2 or 3" - "But received input tensor's dimension is %d", - x_rank)); - - PADDLE_ENFORCE_EQ( - x_shape[x_rank - 1], - y_shape[y_rank - 2], - platform::errors::InvalidArgument( - "Invalid shape for matmul, the last dimension of first input and " - "the penultimate dimension for the second input should be same." - "But received %d and %d.", - x_shape[x_rank - 1], - y_shape[y_rank - 2])); - if (x_rank == 2) { - std::vector<int64_t> z_shape{x_shape[x_rank - 2], y_shape[y_rank - 1]}; - PADDLE_GET(framework::VarDesc *, z_var_ptr)->SetShape(z_shape); - } else { - PADDLE_ENFORCE_EQ(x_shape[0], - y_shape[0], - platform::errors::InvalidArgument( - "Invalid shape for matmul when input tensor's " - "dimension is 3, the first dimension of first " - "input and the second input should be same."
- "But received %d and %d.", - x_shape[0], - y_shape[0])); - - std::vector z_shape{ - x_shape[0], x_shape[x_rank - 2], y_shape[y_rank - 1]}; - PADDLE_GET(framework::VarDesc *, z_var_ptr)->SetShape(z_shape); - } - } -}; - -class MatmulPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Input(ctx, "Y")[0]; - auto z_name = Output(ctx, "Z")[0]; - auto x_type = GetType(ctx, x_name); - auto y_type = GetType(ctx, y_name); - auto x_dtype = GetDataType(ctx, x_name); - auto y_dtype = GetDataType(ctx, y_name); - PADDLE_ENFORCE_EQ(x_type, - y_type, - platform::errors::InvalidArgument( - "The type of two input tensor should be same, " - "but get %d and %d", - x_type, - y_type)); - PADDLE_ENFORCE_EQ(x_dtype, - y_dtype, - platform::errors::InvalidArgument( - "The datatype of two input tensor should be same, " - "but get %d and %d", - x_dtype, - y_dtype)); - - SetType(ctx, z_name, x_type); - SetDataType(ctx, z_name, x_dtype); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(matmul_p, - paddle::operators::MatmulPrimOp, - paddle::operators::MatmulPrimOpMaker, - paddle::operators::MatmulPrimOpShapeInference, - paddle::operators::MatmulPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/max_p_op.cc b/paddle/fluid/operators/prim_ops/max_p_op.cc deleted file mode 100644 index 782925b748eac..0000000000000 --- a/paddle/fluid/operators/prim_ops/max_p_op.cc +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { -class MaxPrimOp : public framework::OperatorBase { - public: - MaxPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator max_p should not be executed directly")); - } -}; - -class MaxPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of max_p op."); - AddInput("Y", "(Tensor), The input tensor of max_p op."); - AddOutput("Z", "(Tensor), The output tensor of max_p op."); - AddComment(R"DOC( -Autograd primitive max_p operator. 
-)DOC"); - } -}; - -class MaxPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; - framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; - - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - framework::VarDesc *y_var = PADDLE_GET(framework::VarDesc *, y_var_ptr); - auto x_shape = x_var->GetShape(); - auto y_shape = y_var->GetShape(); - size_t x_rank = x_shape.size(); - size_t y_rank = y_shape.size(); - PADDLE_ENFORCE_EQ(x_rank, - y_rank, - platform::errors::InvalidArgument( - "The dimensions of two input tensor should be same, " - "but get %d and %d", - x_rank, - y_rank)); - for (size_t i = 0; i < x_rank; ++i) { - PADDLE_ENFORCE_EQ( - x_shape[i], - y_shape[i], - platform::errors::InvalidArgument( - "The shape of two input tensor at dimension %d should be same, " - "but get %d and %d", - i, - x_shape[i], - y_shape[i])); - } - - PADDLE_GET(framework::VarDesc *, z_var_ptr)->SetShape(x_shape); - } -}; - -class MaxPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Input(ctx, "Y")[0]; - auto z_name = Output(ctx, "Z")[0]; - auto x_type = GetType(ctx, x_name); - auto y_type = GetType(ctx, y_name); - auto x_dtype = GetDataType(ctx, x_name); - auto y_dtype = GetDataType(ctx, y_name); - PADDLE_ENFORCE_EQ(x_type, - y_type, - platform::errors::InvalidArgument( - "The type of two input tensor should be same, " - "but get %d and %d", - x_type, - y_type)); - PADDLE_ENFORCE_EQ(x_dtype, - y_dtype, - platform::errors::InvalidArgument( - "The datatype of two input tensor should be same, " - "but get %d and %d", - x_dtype, - y_dtype)); - - SetType(ctx, z_name, x_type); - SetDataType(ctx, z_name, x_dtype); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(max_p, - paddle::operators::MaxPrimOp, - paddle::operators::MaxPrimOpMaker, - paddle::operators::MaxPrimOpShapeInference, - paddle::operators::MaxPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/mul_p_op.cc b/paddle/fluid/operators/prim_ops/mul_p_op.cc deleted file mode 100644 index fd655e887be90..0000000000000 --- a/paddle/fluid/operators/prim_ops/mul_p_op.cc +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class VarDesc; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -class MulPrimOp : public framework::OperatorBase { - public: - MulPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator mul_p should not be executed directly")); - } -}; - -class MulPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of mul_p op."); - AddInput("Y", "(Tensor), The input tensor of mul_p op."); - AddOutput("Z", "(Tensor), The output tensor of mul_p op."); - AddComment(R"DOC( -Autograd primitive mul_p operator. -)DOC"); - } -}; - -class MulPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; - framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; - - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - framework::VarDesc *y_var = PADDLE_GET(framework::VarDesc *, y_var_ptr); - auto x_shape = x_var->GetShape(); - auto y_shape = y_var->GetShape(); - size_t x_rank = x_shape.size(); - size_t y_rank = y_shape.size(); - PADDLE_ENFORCE_EQ(x_rank, - y_rank, - platform::errors::InvalidArgument( - "The dimensions of two input tensor should be same, " - "but get %d and %d", - x_rank, - y_rank)); - for (size_t i = 0; i < x_rank; ++i) { - PADDLE_ENFORCE_EQ( - x_shape[i], - y_shape[i], - platform::errors::InvalidArgument( - "The shape of two input tensor at dimension %d should be same, " - "but get %d and %d", - i, - x_shape[i], - y_shape[i])); - } - - PADDLE_GET(framework::VarDesc *, z_var_ptr)->SetShape(x_shape); - } -}; - -class MulPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Input(ctx, "Y")[0]; - auto z_name = Output(ctx, "Z")[0]; - auto x_type = GetType(ctx, x_name); - auto y_type = GetType(ctx, y_name); - auto x_dtype = GetDataType(ctx, x_name); - auto y_dtype = GetDataType(ctx, y_name); - PADDLE_ENFORCE_EQ(x_type, - y_type, - platform::errors::InvalidArgument( - "The type of two input tensor should be same, " - "but get %d and %d", - x_type, - y_type)); - PADDLE_ENFORCE_EQ(x_dtype, - y_dtype, - platform::errors::InvalidArgument( - "The datatype of two input tensor should be same, " - "but get %d and %d", - x_dtype, - y_dtype)); - - SetType(ctx, z_name, x_type); - SetDataType(ctx, z_name, x_dtype); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(mul_p, - paddle::operators::MulPrimOp, - paddle::operators::MulPrimOpMaker, - paddle::operators::MulPrimOpShapeInference, - paddle::operators::MulPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/ne_p_op.cc b/paddle/fluid/operators/prim_ops/ne_p_op.cc deleted file mode 100644 index 
0d65d1a7e33d9..0000000000000 --- a/paddle/fluid/operators/prim_ops/ne_p_op.cc +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { -class NePrimOp : public framework::OperatorBase { - public: - NePrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator ne_p should not be executed directly")); - } -}; - -class NePrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of ne_p op."); - AddInput("Y", "(Tensor), The input tensor of ne_p op."); - AddOutput("Z", "(Tensor), The output tensor of ne_p op."); - AddComment(R"DOC( -Autograd primitive ne_p operator. -)DOC"); - } -}; - -class NePrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; - framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; - - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - framework::VarDesc *y_var = PADDLE_GET(framework::VarDesc *, y_var_ptr); - auto x_shape = x_var->GetShape(); - auto y_shape = y_var->GetShape(); - size_t x_rank = x_shape.size(); - size_t y_rank = y_shape.size(); - PADDLE_ENFORCE_EQ(x_rank, - y_rank, - platform::errors::InvalidArgument( - "The dimensions of two input tensor should be same, " - "but get %d and %d", - x_rank, - y_rank)); - for (size_t i = 0; i < x_rank; ++i) { - PADDLE_ENFORCE_EQ( - x_shape[i], - y_shape[i], - platform::errors::InvalidArgument( - "The shape of two input tensor at dimension %d should be same, " - "but get %d and %d", - i, - x_shape[i], - y_shape[i])); - } - - PADDLE_GET(framework::VarDesc *, z_var_ptr)->SetShape(x_shape); - } -}; - -class NePrimOpVarTypeInference : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Input(ctx, "Y")[0]; - auto z_name = Output(ctx, "Z")[0]; - auto x_type = GetType(ctx, x_name); - auto y_type = GetType(ctx, y_name); - auto x_dtype = GetDataType(ctx, x_name); - auto y_dtype = GetDataType(ctx, y_name); - PADDLE_ENFORCE_EQ(x_type, - y_type, - platform::errors::InvalidArgument( - "The type of two input tensor should be same, " - "but get %d and %d", - x_type, - y_type)); - PADDLE_ENFORCE_EQ(x_dtype, - y_dtype, - 
platform::errors::InvalidArgument( - "The datatype of two input tensor should be same, " - "but get %d and %d", - x_dtype, - y_dtype)); - - SetType(ctx, z_name, x_type); - SetDataType(ctx, z_name, framework::proto::VarType::BOOL); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(ne_p, - paddle::operators::NePrimOp, - paddle::operators::NePrimOpMaker, - paddle::operators::NePrimOpShapeInference, - paddle::operators::NePrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/pow_p_op.cc b/paddle/fluid/operators/prim_ops/pow_p_op.cc deleted file mode 100644 index 50e625a328e58..0000000000000 --- a/paddle/fluid/operators/prim_ops/pow_p_op.cc +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { -class PowPrimOp : public framework::OperatorBase { - public: - PowPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator pow_p should not be executed directly")); - } -}; - -class PowPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The base of pow_p op."); - AddInput("Y", "(Tensor), The exponents of pow_p op."); - AddOutput("Z", "(Tensor), The output tensor of pow_p op."); - AddComment(R"DOC( -Autograd primitive pow_p operator. 
-)DOC"); - } -}; - -class PowPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; - framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; - - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - framework::VarDesc *y_var = PADDLE_GET(framework::VarDesc *, y_var_ptr); - auto x_shape = x_var->GetShape(); - auto y_shape = y_var->GetShape(); - size_t x_rank = x_shape.size(); - size_t y_rank = y_shape.size(); - - PADDLE_ENFORCE_EQ(x_rank, - y_rank, - platform::errors::InvalidArgument( - "The dimensions of two input tensor should be same, " - "but get %d and %d", - x_rank, - y_rank)); - for (size_t i = 0; i < x_rank; ++i) { - PADDLE_ENFORCE_EQ( - x_shape[i], - y_shape[i], - platform::errors::InvalidArgument( - "The shape of two input tensor at dimension %d should be same, " - "but get %d and %d", - i, - x_shape[i], - y_shape[i])); - } - - PADDLE_GET(framework::VarDesc *, z_var_ptr)->SetShape(x_shape); - } -}; - -class PowPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Input(ctx, "Y")[0]; - auto z_name = Output(ctx, "Z")[0]; - auto x_type = GetType(ctx, x_name); - auto y_type = GetType(ctx, y_name); - auto x_dtype = GetDataType(ctx, x_name); - - PADDLE_ENFORCE_EQ(x_type, - y_type, - platform::errors::InvalidArgument( - "The type of two input tensor should be same, " - "but get %d and %d", - x_type, - y_type)); - - SetType(ctx, z_name, x_type); - SetDataType(ctx, z_name, x_dtype); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(pow_p, - paddle::operators::PowPrimOp, - paddle::operators::PowPrimOpMaker, - paddle::operators::PowPrimOpShapeInference, - paddle::operators::PowPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/reduce_sum_p_op.cc b/paddle/fluid/operators/prim_ops/reduce_sum_p_op.cc deleted file mode 100644 index dbb33a98b108c..0000000000000 --- a/paddle/fluid/operators/prim_ops/reduce_sum_p_op.cc +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class VarDesc; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -class ReduceSumPrimOp : public framework::OperatorBase { - public: - ReduceSumPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator reduce_sum_p should not be executed directly")); - } -}; - -class ReduceSumPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of reduce_sum_p op."); - AddOutput("Y", "(Tensor), The output tensor of reduce_sum_p op."); - AddAttr>( - "axis", - "(std::vector) The axis along which to reduce on. Must be in " - "range [-rank(input), rank(input)]. If `axis[i] < 0`, the axis[i] to " - "reduce is `rank + axis[i]`."); - AddAttr("keepdim", - "(bool, default false) " - "If true, retain the reduced axis with length 1.") - .SetDefault(false); - AddComment(R"DOC( -Autograd primitive reduce_sum_p operator. -)DOC"); - } -}; - -class ReduceSumPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - auto x_shape = x_var->GetShape(); - auto axis = ctx->Attrs().Get>("axis"); - auto keepdim = ctx->Attrs().Get("keepdim"); - if (keepdim) { - for (auto item : axis) { - x_shape[item] = 1; - } - } else { - const int kDelFlag = -2; - for (auto item : axis) { - x_shape[item] = kDelFlag; - } - x_shape.erase(remove(x_shape.begin(), x_shape.end(), kDelFlag), - x_shape.end()); - } - - PADDLE_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_shape); - } -}; - -class ReduceSumPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Output(ctx, "Y")[0]; - SetType(ctx, y_name, GetType(ctx, x_name)); - SetDataType(ctx, y_name, GetDataType(ctx, x_name)); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(reduce_sum_p, - paddle::operators::ReduceSumPrimOp, - paddle::operators::ReduceSumPrimOpMaker, - paddle::operators::ReduceSumPrimOpShapeInference, - paddle::operators::ReduceSumPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/reshape_p_op.cc b/paddle/fluid/operators/prim_ops/reshape_p_op.cc deleted file mode 100644 index 8137dfd629b01..0000000000000 --- a/paddle/fluid/operators/prim_ops/reshape_p_op.cc +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class VarDesc; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -class ReshapePrimOp : public framework::OperatorBase { - public: - ReshapePrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator reshape_p should not be executed directly")); - } -}; - -class ReshapePrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of reshape_p op."); - AddOutput("Y", "(Tensor), The output tensor of reshape_p op."); - AddAttr>( - "shape", "(std::vector) Target shape of reshape_p operator."); - AddComment(R"DOC( -Autograd primitive reshape_p operator. -)DOC"); - } -}; - -static int64_t product(const std::vector &shape) { - int64_t rslt = 1; - for (auto item : shape) { - rslt *= item; - } - return rslt; -} - -class ReshapePrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - auto x_shape = x_var->GetShape(); - auto shape = ctx->Attrs().Get>("shape"); - PADDLE_ENFORCE_EQ(product(x_shape), - product(shape), - platform::errors::InvalidArgument( - "The input tensor can't be reshaped to target shape, " - "the input tensor has %d elements but target shape " - "contains %d elements", - product(x_shape), - product(shape))); - PADDLE_GET(framework::VarDesc *, y_var_ptr)->SetShape(shape); - } -}; - -class ReshapePrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Output(ctx, "Y")[0]; - SetType(ctx, y_name, GetType(ctx, x_name)); - SetDataType(ctx, y_name, GetDataType(ctx, x_name)); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(reshape_p, - paddle::operators::ReshapePrimOp, - paddle::operators::ReshapePrimOpMaker, - paddle::operators::ReshapePrimOpShapeInference, - paddle::operators::ReshapePrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/rsqrt_p_op.cc b/paddle/fluid/operators/prim_ops/rsqrt_p_op.cc deleted file mode 100644 index d2401c6d4e40f..0000000000000 --- a/paddle/fluid/operators/prim_ops/rsqrt_p_op.cc +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
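Editor's note: the two shape rules just removed are plain arithmetic: reduce_sum_p collapses each reduced axis to 1 under keepdim and erases it otherwise, while reshape_p only demands that the element counts of the old and new shapes agree. A self-contained sketch of both computations (not Paddle API, just the arithmetic the deleted InferShapeBase classes performed):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// reduce_sum_p: output shape after reducing `axis`, with or without keepdim.
std::vector<int64_t> ReducedShape(std::vector<int64_t> shape,
                                  const std::vector<int64_t>& axis,
                                  bool keepdim) {
  const int64_t kDelFlag = -2;  // same erase-marker trick as the deleted code
  for (int64_t a : axis) shape[a] = keepdim ? 1 : kDelFlag;
  if (!keepdim)
    shape.erase(std::remove(shape.begin(), shape.end(), kDelFlag),
                shape.end());
  return shape;
}

// reshape_p: legal only when the element counts match.
bool CanReshape(const std::vector<int64_t>& from,
                const std::vector<int64_t>& to) {
  auto product = [](const std::vector<int64_t>& s) {
    return std::accumulate(s.begin(), s.end(), int64_t{1},
                           std::multiplies<int64_t>());
  };
  return product(from) == product(to);
}

int main() {
  assert((ReducedShape({2, 3, 4}, {1}, true) == std::vector<int64_t>{2, 1, 4}));
  assert((ReducedShape({2, 3, 4}, {1}, false) == std::vector<int64_t>{2, 4}));
  assert(CanReshape({2, 3, 4}, {6, 4}) && !CanReshape({2, 3}, {4, 2}));
  return 0;
}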
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class VarDesc; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -class RsqrtPrimOp : public framework::OperatorBase { - public: - RsqrtPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator rsqrt_p should not be executed directly")); - } -}; - -class RsqrtPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of rsqrt_p op."); - AddOutput("Y", "(Tensor), The output tensor of rsqrt_p op."); - AddComment(R"DOC( -Autograd primitive rsqrt_p operator. -)DOC"); - } -}; - -class RsqrtPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; - - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - - PADDLE_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_var->GetShape()); - } -}; - -class RsqrtPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Output(ctx, "Y")[0]; - SetType(ctx, y_name, GetType(ctx, x_name)); - SetDataType(ctx, y_name, GetDataType(ctx, x_name)); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(rsqrt_p, - paddle::operators::RsqrtPrimOp, - paddle::operators::RsqrtPrimOpMaker, - paddle::operators::RsqrtPrimOpShapeInference, - paddle::operators::RsqrtPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/scatter_add_p_op.cc b/paddle/fluid/operators/prim_ops/scatter_add_p_op.cc deleted file mode 100644 index 2b116d5224073..0000000000000 --- a/paddle/fluid/operators/prim_ops/scatter_add_p_op.cc +++ /dev/null @@ -1,176 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class VarDesc; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -class ScatterAddPrimOp : public framework::OperatorBase { - public: - ScatterAddPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator scatter_add_p should not be executed directly")); - } -}; - -class ScatterAddPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The tensor to apply scatter rule and add on."); - AddInput("Y", "(Tensor), The source tensor of scatter_add_p op."); - AddInput( - "IndexTensor", - "(Tensor), The index tensor of scatter_add_p op, which is a 1D tensor.") - .AsDispensable(); - AddOutput("Z", "(Tensor), The output tensor of scatter_add_p op."); - AddAttr<int64_t>("axis", - "(int64_t), The axis along which to scatter and add."); - AddAttr<std::vector<int64_t>>( - "index", "(std::vector<int64_t>) The index of scatter_add_p op") - .SetDefault({0}); - AddComment(R"DOC( -Autograd primitive scatter_add_p operator. -)DOC"); - } -}; - -class ScatterAddPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; - framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; - int64_t num_index = 0; - if (ctx->HasInput("IndexTensor")) { - framework::InferShapeVarPtr index_var_ptr = - ctx->GetInputVarPtrs("IndexTensor")[0]; - framework::VarDesc *index_var = - PADDLE_GET(framework::VarDesc *, index_var_ptr); - auto index_shape = index_var->GetShape(); - PADDLE_ENFORCE_EQ(index_shape.size(), - 1, - platform::errors::InvalidArgument( - "The index tensor should be a 1D tensor, " - "but get rank %d", - index_shape.size())); - num_index = index_shape[0]; - } else { - num_index = static_cast<int64_t>( - ctx->Attrs().Get<std::vector<int64_t>>("index").size()); - } - auto axis = ctx->Attrs().Get<int64_t>("axis"); - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - framework::VarDesc *y_var = PADDLE_GET(framework::VarDesc *, y_var_ptr); - auto x_shape = x_var->GetShape(); - auto y_shape = y_var->GetShape(); - size_t x_rank = x_shape.size(); - size_t y_rank = y_shape.size(); - PADDLE_ENFORCE_EQ(x_rank, - y_rank, - platform::errors::InvalidArgument( - "The dimensions of two input tensor should be same, " - "but get %d and %d", - x_rank, - y_rank)); - PADDLE_ENFORCE_EQ(y_shape[axis], - num_index, - platform::errors::InvalidArgument( - "The shape of source input tensor at scatter axis " - "should be equal to num_index, " - "but get %d and %d", - y_shape[axis], - num_index)); - for (size_t i = 0; i < x_rank; ++i) { - if (i != size_t(axis)) { - PADDLE_ENFORCE_EQ( - x_shape[i], - y_shape[i], - platform::errors::InvalidArgument( - "The shape of two input tensor at dimension %d should be same, " - "but get %d and %d", - i, - x_shape[i], - y_shape[i])); - } - }
- - PADDLE_GET(framework::VarDesc *, z_var_ptr)->SetShape(x_shape); - } -}; - -class ScatterAddPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Input(ctx, "Y")[0]; - auto z_name = Output(ctx, "Z")[0]; - auto x_type = GetType(ctx, x_name); - auto y_type = GetType(ctx, y_name); - auto x_dtype = GetDataType(ctx, x_name); - auto y_dtype = GetDataType(ctx, y_name); - PADDLE_ENFORCE_EQ(x_type, - y_type, - platform::errors::InvalidArgument( - "The type of two input tensor should be same, " - "but get %d and %d", - x_type, - y_type)); - PADDLE_ENFORCE_EQ(x_dtype, - y_dtype, - platform::errors::InvalidArgument( - "The datatype of two input tensor should be same, " - "but get %d and %d", - x_dtype, - y_dtype)); - - if (ctx->HasInput("IndexTensor")) { - auto index_name = Input(ctx, "IndexTensor")[0]; - auto index_dtype = GetDataType(ctx, index_name); - PADDLE_ENFORCE_EQ( - index_dtype, - framework::proto::VarType_Type_INT32, - platform::errors::InvalidArgument( - "The datatype of input tensor should be VarType_Type_INT32(%d), " - "but get %d", - framework::proto::VarType_Type_INT32, - index_dtype)); - } - SetType(ctx, z_name, GetType(ctx, x_name)); - SetDataType(ctx, z_name, GetDataType(ctx, x_name)); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(scatter_add_p, - paddle::operators::ScatterAddPrimOp, - paddle::operators::ScatterAddPrimOpMaker, - paddle::operators::ScatterAddPrimOpShapeInference, - paddle::operators::ScatterAddPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/select_p_op.cc b/paddle/fluid/operators/prim_ops/select_p_op.cc deleted file mode 100644 index 69253da41d7d2..0000000000000 --- a/paddle/fluid/operators/prim_ops/select_p_op.cc +++ /dev/null @@ -1,153 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
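Editor's note: scatter_add_p, deleted just above, constrains its inputs so that Y matches X at every dimension except the scatter axis, where Y's extent must equal the number of indices (the 1-D IndexTensor when present, otherwise the `index` attribute). A small dense sketch of the runtime semantics those shapes guarantee (illustrative only; the prim op itself is never executed):

#include <cassert>
#include <vector>

// Dense scatter-add along axis 0: x[index[i]][j] += y[i][j].
// x is n x m, y is k x m, and index holds k positions in [0, n).
void ScatterAdd(std::vector<std::vector<double>>* x,
                const std::vector<std::vector<double>>& y,
                const std::vector<int>& index) {
  assert(y.size() == index.size());  // y_shape[axis] == num_index
  for (size_t i = 0; i < index.size(); ++i)
    for (size_t j = 0; j < y[i].size(); ++j)
      (*x)[index[i]][j] += y[i][j];  // non-axis dimensions must line up
}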
- -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { -class SelectPrimOp : public framework::OperatorBase { - public: - SelectPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator select_p should not be executed directly")); - } -}; - -class SelectPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Condition", "(Tensor), The input condition of select_p op."); - AddInput("X", "(Tensor), The input tensor of select_p op."); - AddInput("Y", "(Tensor), The input tensor of select_p op."); - AddOutput("Z", "(Tensor), The output tensor of select_p op."); - AddComment(R"DOC( -Autograd primitive select_p operator. -)DOC"); - } -}; - -class SelectPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr condition_var_ptr = - ctx->GetInputVarPtrs("Condition")[0]; - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; - framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; - - framework::VarDesc *condition_var = - PADDLE_GET(framework::VarDesc *, condition_var_ptr); - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - framework::VarDesc *y_var = PADDLE_GET(framework::VarDesc *, y_var_ptr); - - auto condition_shape = condition_var->GetShape(); - auto x_shape = x_var->GetShape(); - auto y_shape = y_var->GetShape(); - - size_t condition_rank = condition_shape.size(); - size_t x_rank = x_shape.size(); - size_t y_rank = y_shape.size(); - - PADDLE_ENFORCE_EQ( - condition_rank, - x_rank, - platform::errors::InvalidArgument( - "The dimensions of condtion and Inputs(X) should be same, " - "but get %d and %d", - condition_rank, - x_rank)); - PADDLE_ENFORCE_EQ( - x_rank, - y_rank, - platform::errors::InvalidArgument( - "The dimensions of Inputs(X) and Inputs(Y) should be same, " - "but get %d and %d", - x_rank, - y_rank)); - for (size_t i = 0; i < condition_rank; ++i) { - PADDLE_ENFORCE_EQ(condition_shape[i], - x_shape[i], - platform::errors::InvalidArgument( - "The shape of condition and Inputs(X) at dimension " - "%d should be same, " - "but get %d and %d", - i, - condition_shape[i], - x_shape[i])); - } - for (size_t i = 0; i < x_rank; ++i) { - PADDLE_ENFORCE_EQ(x_shape[i], - y_shape[i], - platform::errors::InvalidArgument( - "The shape of Inputs(X) and Inputs(Y) at dimension " - "%d should be same, " - "but get %d and %d", - i, - x_shape[i], - y_shape[i])); - } - - PADDLE_GET(framework::VarDesc *, z_var_ptr)->SetShape(condition_shape); - } -}; - -class SelectPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Input(ctx, "Y")[0]; - auto z_name = Output(ctx, "Z")[0]; - - auto x_type = GetType(ctx, x_name); - auto y_type = GetType(ctx, y_name); - - auto x_dtype = GetDataType(ctx, x_name); - auto y_dtype = GetDataType(ctx, y_name); - - PADDLE_ENFORCE_EQ(x_type, 
- y_type, - platform::errors::InvalidArgument( - "The type of two input tensor should be same, " - "but get %d and %d", - x_type, - y_type)); - PADDLE_ENFORCE_EQ(x_dtype, - y_dtype, - platform::errors::InvalidArgument( - "The datatype of two input tensor should be same, " - "but get %d and %d", - x_dtype, - y_dtype)); - - SetType(ctx, z_name, x_type); - SetDataType(ctx, z_name, x_dtype); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(select_p, - paddle::operators::SelectPrimOp, - paddle::operators::SelectPrimOpMaker, - paddle::operators::SelectPrimOpShapeInference, - paddle::operators::SelectPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/sin_p_op.cc b/paddle/fluid/operators/prim_ops/sin_p_op.cc deleted file mode 100644 index 95b413acc77af..0000000000000 --- a/paddle/fluid/operators/prim_ops/sin_p_op.cc +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { -class SinPrimOp : public framework::OperatorBase { - public: - SinPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator sin_p should not be executed directly")); - } -}; - -class SinPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of sin_p op."); - AddOutput("Y", "(Tensor), The output tensor of sin_p op."); - AddComment(R"DOC(Autograd primitive sin_p operator.)DOC"); - } -}; - -class SinPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - PADDLE_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_var->GetShape()); - } -}; - -class SinPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Output(ctx, "Y")[0]; - SetType(ctx, y_name, GetType(ctx, x_name)); - SetDataType(ctx, y_name, GetDataType(ctx, x_name)); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(sin_p, - paddle::operators::SinPrimOp, - paddle::operators::SinPrimOpMaker, - paddle::operators::SinPrimOpShapeInference, - paddle::operators::SinPrimOpVarTypeInference); diff --git 
a/paddle/fluid/operators/prim_ops/slice_assign_p_op.cc b/paddle/fluid/operators/prim_ops/slice_assign_p_op.cc deleted file mode 100644 index 9485d621aa5d4..0000000000000 --- a/paddle/fluid/operators/prim_ops/slice_assign_p_op.cc +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class VarDesc; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -class SliceAssignPrimOp : public framework::OperatorBase { - public: - SliceAssignPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator slice_assign_p should not be executed directly")); - } -}; - -class SliceAssignPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The tensor to slice from and assign on."); - AddInput("Y", "(Tensor), The source tensor of slice_assign_p op."); - AddOutput("Z", "(Tensor), The output tensor of slice_assign_p op."); - AddAttr>( - "axis", "(std::vector), The axis along which to gather."); - AddAttr>( - "starts", - "(std::vector) The slice starts of slice_assign_p op"); - AddAttr>( - "ends", "(std::vector) The slice ends of slice_assign_p op"); - AddAttr>( - "strides", - "(std::vector) The slice strides of slice_assign_p op"); - AddComment(R"DOC( -Autograd primitive slice_assign_p operator. 
-)DOC"); - } -}; - -class SliceAssignPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; - framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - framework::VarDesc *y_var = PADDLE_GET(framework::VarDesc *, y_var_ptr); - auto x_shape = x_var->GetShape(); - auto y_shape = y_var->GetShape(); - size_t x_rank = x_shape.size(); - size_t y_rank = y_shape.size(); - auto axis = ctx->Attrs().Get>("axis"); - auto starts = ctx->Attrs().Get>("starts"); - auto ends = ctx->Attrs().Get>("ends"); - auto strides = ctx->Attrs().Get>("strides"); - PADDLE_ENFORCE_EQ( - starts.size(), - axis.size(), - platform::errors::InvalidArgument( - "Number of starts attribute and axis attribute should be same, " - "but get %d and %d", - starts.size(), - axis.size())); - PADDLE_ENFORCE_EQ( - ends.size(), - axis.size(), - platform::errors::InvalidArgument( - "Number of ends attribute and axis attribute should be same, " - "but get %d and %d", - ends.size(), - axis.size())); - PADDLE_ENFORCE_EQ( - strides.size(), - axis.size(), - platform::errors::InvalidArgument( - "Number of strides attribute and axis attribute should be same, " - "but get %d and %d", - strides.size(), - axis.size())); - PADDLE_ENFORCE_EQ(x_rank, - y_rank, - platform::errors::InvalidArgument( - "The dimensions of two input tensor should be same, " - "but get %d and %d", - x_rank, - y_rank)); - std::vector y_target_shape(x_shape); - for (size_t i = 0; i < axis.size(); ++i) { - y_target_shape[axis[i]] = - (ends[i] - starts[i] + strides[i] - 1) / strides[i]; - } - for (size_t i = 0; i < x_rank; ++i) { - PADDLE_ENFORCE_EQ(y_target_shape[i], - y_shape[i], - platform::errors::InvalidArgument( - "The shape of source tensor of slice_assign_p op " - "at dimension %d should be %d, " - "but get %d", - i, - y_target_shape[i], - y_shape[i])); - } - PADDLE_GET(framework::VarDesc *, z_var_ptr)->SetShape(x_shape); - } -}; - -class SliceAssignPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Input(ctx, "Y")[0]; - auto z_name = Output(ctx, "Z")[0]; - auto x_type = GetType(ctx, x_name); - auto y_type = GetType(ctx, y_name); - auto x_dtype = GetDataType(ctx, x_name); - auto y_dtype = GetDataType(ctx, y_name); - PADDLE_ENFORCE_EQ(x_type, - y_type, - platform::errors::InvalidArgument( - "The type of two input tensor should be same, " - "but get %d and %d", - x_type, - y_type)); - PADDLE_ENFORCE_EQ(x_dtype, - y_dtype, - platform::errors::InvalidArgument( - "The datatype of two input tensor should be same, " - "but get %d and %d", - x_dtype, - y_dtype)); - - SetType(ctx, z_name, GetType(ctx, x_name)); - SetDataType(ctx, z_name, GetDataType(ctx, x_name)); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(slice_assign_p, - paddle::operators::SliceAssignPrimOp, - paddle::operators::SliceAssignPrimOpMaker, - paddle::operators::SliceAssignPrimOpShapeInference, - paddle::operators::SliceAssignPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/slice_select_p_op.cc b/paddle/fluid/operators/prim_ops/slice_select_p_op.cc deleted file mode 100644 index 
dd2242368b739..0000000000000 --- a/paddle/fluid/operators/prim_ops/slice_select_p_op.cc +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class VarDesc; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -class SliceSelectPrimOp : public framework::OperatorBase { - public: - SliceSelectPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator slice_select_p should not be executed directly")); - } -}; - -class SliceSelectPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of slice_select_p op."); - AddOutput("Y", "(Tensor), The output tensor of slice_select_p op."); - AddAttr>( - "axis", "(std::vector), The axis along which to gather."); - AddAttr>( - "starts", - "(std::vector) The slice starts of slice_select_p op"); - AddAttr>( - "ends", "(std::vector) The slice ends of slice_select_p op"); - AddAttr>( - "strides", - "(std::vector) The slice strides of slice_select_p op"); - AddComment(R"DOC( -Autograd primitive slice_select_p operator. 
-)DOC"); - } -}; - -class SliceSelectPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - auto x_shape = x_var->GetShape(); - auto axis = ctx->Attrs().Get>("axis"); - auto starts = ctx->Attrs().Get>("starts"); - auto ends = ctx->Attrs().Get>("ends"); - auto strides = ctx->Attrs().Get>("strides"); - PADDLE_ENFORCE_EQ( - starts.size(), - axis.size(), - platform::errors::InvalidArgument( - "Number of starts attribute and axis attribute should be same, " - "but get %d and %d", - starts.size(), - axis.size())); - PADDLE_ENFORCE_EQ( - ends.size(), - axis.size(), - platform::errors::InvalidArgument( - "Number of ends attribute and axis attribute should be same, " - "but get %d and %d", - ends.size(), - axis.size())); - PADDLE_ENFORCE_EQ( - strides.size(), - axis.size(), - platform::errors::InvalidArgument( - "Number of strides attribute and axis attribute should be same, " - "but get %d and %d", - strides.size(), - axis.size())); - for (size_t i = 0; i < axis.size(); ++i) { - x_shape[axis[i]] = (ends[i] - starts[i] + strides[i] - 1) / strides[i]; - } - PADDLE_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_shape); - } -}; - -class SliceSelectPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Output(ctx, "Y")[0]; - SetType(ctx, y_name, GetType(ctx, x_name)); - SetDataType(ctx, y_name, GetDataType(ctx, x_name)); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(slice_select_p, - paddle::operators::SliceSelectPrimOp, - paddle::operators::SliceSelectPrimOpMaker, - paddle::operators::SliceSelectPrimOpShapeInference, - paddle::operators::SliceSelectPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/split_p_op.cc b/paddle/fluid/operators/prim_ops/split_p_op.cc deleted file mode 100644 index bc0f8b8a31cda..0000000000000 --- a/paddle/fluid/operators/prim_ops/split_p_op.cc +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class VarDesc; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -class SplitPrimOp : public framework::OperatorBase { - public: - SplitPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator split_p should not be executed directly")); - } -}; - -class SplitPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of split_p op."); - AddOutput("YS", "(Tensor), The output tensors of split_p op.") - .AsDuplicable(); - AddAttr("axis", "(int64_t), The axis along which to split."); - AddAttr>( - "num_or_sections", - "(std::vector) If num_or_sections has only one element, then " - "num_or_sections indicates the number of equal sized sub-Tensors that " - "the input will be divided into. If num_or_sections has more then one " - "element, the length of it indicates the number of sub-Tensors and the " - "elements in it indicate the sizes of sub-Tensors' dimension orderly. " - "The length of the vector must not be larger than the input's size of " - "specified axis."); - AddComment(R"DOC( -Autograd primitive split_p operator. -)DOC"); - } -}; - -class SplitPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - auto y_var_ptrs = ctx->GetOutputVarPtrs("YS"); - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - auto x_shape = x_var->GetShape(); - auto axis = ctx->Attrs().Get("axis"); - auto num_or_sections = - ctx->Attrs().Get>("num_or_sections"); - std::vector y_shape(x_shape); - if (num_or_sections.size() == 1) { - PADDLE_ENFORCE_EQ(x_shape[axis] % num_or_sections[0], - 0, - platform::errors::InvalidArgument( - "The input tensor can't be devided equally into %d " - "parts equally along axis %d", - num_or_sections[0], - axis)); - y_shape[axis] = x_shape[axis] / num_or_sections[0]; - for (size_t i = 0; i < size_t(num_or_sections[0]); ++i) { - PADDLE_GET(framework::VarDesc *, y_var_ptrs[i])->SetShape(y_shape); - } - } else { - int64_t cnt_along_axis = 0; - for (size_t i = 0; i < num_or_sections.size(); ++i) { - y_shape[axis] = num_or_sections[i]; - cnt_along_axis += num_or_sections[i]; - PADDLE_GET(framework::VarDesc *, y_var_ptrs[i])->SetShape(y_shape); - } - PADDLE_ENFORCE_EQ( - x_shape[axis], - cnt_along_axis, - platform::errors::InvalidArgument( - "The input tensor has %d elements along axis %d, thus can't be " - "devided into %d tensor with %d elements totally.", - x_shape[axis], - axis, - num_or_sections.size(), - cnt_along_axis)); - } - } -}; - -class SplitPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_names = Output(ctx, "YS"); - for (auto const &y_name : y_names) { - SetType(ctx, y_name, GetType(ctx, x_name)); - SetDataType(ctx, y_name, GetDataType(ctx, 
x_name)); - } - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(split_p, - paddle::operators::SplitPrimOp, - paddle::operators::SplitPrimOpMaker, - paddle::operators::SplitPrimOpShapeInference, - paddle::operators::SplitPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/sqrt_p_op.cc b/paddle/fluid/operators/prim_ops/sqrt_p_op.cc deleted file mode 100644 index caebfd388f68f..0000000000000 --- a/paddle/fluid/operators/prim_ops/sqrt_p_op.cc +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class VarDesc; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -class SqrtPrimOp : public framework::OperatorBase { - public: - SqrtPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator sqrt_p should not be executed directly")); - } -}; - -class SqrtPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of sqrt_p op."); - AddOutput("Y", "(Tensor), The output tensor of sqrt_p op."); - AddComment(R"DOC( -Autograd primitive sqrt_p operator. 
-)DOC"); - } -}; - -class SqrtPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; - - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - - PADDLE_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_var->GetShape()); - } -}; - -class SqrtPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Output(ctx, "Y")[0]; - SetType(ctx, y_name, GetType(ctx, x_name)); - SetDataType(ctx, y_name, GetDataType(ctx, x_name)); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(sqrt_p, - paddle::operators::SqrtPrimOp, - paddle::operators::SqrtPrimOpMaker, - paddle::operators::SqrtPrimOpShapeInference, - paddle::operators::SqrtPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/sub_p_op.cc b/paddle/fluid/operators/prim_ops/sub_p_op.cc deleted file mode 100644 index 4497978093f4f..0000000000000 --- a/paddle/fluid/operators/prim_ops/sub_p_op.cc +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class VarDesc; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -class SubPrimOp : public framework::OperatorBase { - public: - SubPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator sub_p should not be executed directly")); - } -}; - -class SubPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of sub_p op."); - AddInput("Y", "(Tensor), The input tensor of sub_p op."); - AddOutput("Z", "(Tensor), The output tensor of sub_p op."); - AddComment(R"DOC( -Autograd primitive sub_p operator. 
-)DOC"); - } -}; - -class SubPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; - framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; - - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - framework::VarDesc *y_var = PADDLE_GET(framework::VarDesc *, y_var_ptr); - auto x_shape = x_var->GetShape(); - auto y_shape = y_var->GetShape(); - size_t x_rank = x_shape.size(); - size_t y_rank = y_shape.size(); - PADDLE_ENFORCE_EQ(x_rank, - y_rank, - platform::errors::InvalidArgument( - "The dimensions of two input tensor should be same, " - "but get %d and %d", - x_rank, - y_rank)); - for (size_t i = 0; i < x_rank; ++i) { - PADDLE_ENFORCE_EQ( - x_shape[i], - y_shape[i], - platform::errors::InvalidArgument( - "The shape of two input tensor at dimension %d should be same, " - "but get %d and %d", - i, - x_shape[i], - y_shape[i])); - } - - PADDLE_GET(framework::VarDesc *, z_var_ptr)->SetShape(x_shape); - } -}; - -class SubPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Input(ctx, "Y")[0]; - auto z_name = Output(ctx, "Z")[0]; - auto x_type = GetType(ctx, x_name); - auto y_type = GetType(ctx, y_name); - auto x_dtype = GetDataType(ctx, x_name); - auto y_dtype = GetDataType(ctx, y_name); - PADDLE_ENFORCE_EQ(x_type, - y_type, - platform::errors::InvalidArgument( - "The type of two input tensor should be same, " - "but get %d and %d", - x_type, - y_type)); - PADDLE_ENFORCE_EQ(x_dtype, - y_dtype, - platform::errors::InvalidArgument( - "The datatype of two input tensor should be same, " - "but get %d and %d", - x_dtype, - y_dtype)); - - SetType(ctx, z_name, x_type); - SetDataType(ctx, z_name, x_dtype); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(sub_p, - paddle::operators::SubPrimOp, - paddle::operators::SubPrimOpMaker, - paddle::operators::SubPrimOpShapeInference, - paddle::operators::SubPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/tanh_p_op.cc b/paddle/fluid/operators/prim_ops/tanh_p_op.cc deleted file mode 100644 index 042394aa15068..0000000000000 --- a/paddle/fluid/operators/prim_ops/tanh_p_op.cc +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class VarDesc; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -class TanhPrimOp : public framework::OperatorBase { - public: - TanhPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator tanh_p should not be executed directly")); - } -}; - -class TanhPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of tanh_p op."); - AddOutput("Y", "(Tensor), The output tensor of tanh_p op."); - AddComment(R"DOC( -Autograd primitive tanh_p operator. -)DOC"); - } -}; - -class TanhPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; - - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - - PADDLE_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_var->GetShape()); - } -}; - -class TanhPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Output(ctx, "Y")[0]; - SetType(ctx, y_name, GetType(ctx, x_name)); - SetDataType(ctx, y_name, GetDataType(ctx, x_name)); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(tanh_p, - paddle::operators::TanhPrimOp, - paddle::operators::TanhPrimOpMaker, - paddle::operators::TanhPrimOpShapeInference, - paddle::operators::TanhPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/transpose_p_op.cc b/paddle/fluid/operators/prim_ops/transpose_p_op.cc deleted file mode 100644 index cb76f81ef0901..0000000000000 --- a/paddle/fluid/operators/prim_ops/transpose_p_op.cc +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class VarDesc; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -class TransposePrimOp : public framework::OperatorBase { - public: - TransposePrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator transpose_p should not be executed directly")); - } -}; - -class TransposePrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of transpose_p op."); - AddOutput("Y", "(Tensor), The output tensor of transpose_p op."); - AddAttr>("axis", - "(std::vector) Tanspose axis."); - AddComment(R"DOC( -Autograd primitive transpose_p operator. -)DOC"); - } -}; - -class TransposePrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; - framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; - framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); - auto x_shape = x_var->GetShape(); - auto axis = ctx->Attrs().Get>("axis"); - size_t x_rank = x_shape.size(); - size_t axis_size = axis.size(); - PADDLE_ENFORCE_EQ(x_rank, - axis_size, - platform::errors::InvalidArgument( - "The input tensor's dimension " - "should be equal to the axis's size. " - "But received input tensor's dimension is %d, " - "axis's size is %d", - x_rank, - axis_size)); - - std::vector count(axis_size, 0); - for (size_t i = 0; i < axis_size; i++) { - PADDLE_ENFORCE_GE(axis[i], - 0, - platform::errors::InvalidArgument( - "The axis should be greater than or equal to 0." - "But received %d of axis[%d]", - axis[i], - i)); - - PADDLE_ENFORCE_EQ( - axis[i] < static_cast(axis_size) && ++count[axis[i]] == 1, - true, - platform::errors::InvalidArgument( - "Each element of Attribute axis should " - "be a unique value range from 0 to (dims - 1), " - "where the dims is the axis's size, " - "unique value means this axis value can appear only once. 
" - "But received axis[%d] is %d, axis_size is %d, " - "count[axis[%d]] is %d", - i, - axis[i], - axis_size, - i, - count[axis[i]])); - } - std::vector y_shape(axis_size); - for (size_t i = 0; i < axis_size; i++) { - y_shape[i] = x_shape[axis[i]]; - } - PADDLE_GET(framework::VarDesc *, y_var_ptr)->SetShape(y_shape); - } -}; - -class TransposePrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto x_name = Input(ctx, "X")[0]; - auto y_name = Output(ctx, "Y")[0]; - SetType(ctx, y_name, GetType(ctx, x_name)); - SetDataType(ctx, y_name, GetDataType(ctx, x_name)); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(transpose_p, - paddle::operators::TransposePrimOp, - paddle::operators::TransposePrimOpMaker, - paddle::operators::TransposePrimOpShapeInference, - paddle::operators::TransposePrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/uniform_random_p_op.cc b/paddle/fluid/operators/prim_ops/uniform_random_p_op.cc deleted file mode 100644 index 3a06459d33798..0000000000000 --- a/paddle/fluid/operators/prim_ops/uniform_random_p_op.cc +++ /dev/null @@ -1,88 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class VarDesc; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -class UniformRandomPrimOp : public framework::OperatorBase { - public: - UniformRandomPrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator uniform_randrom_p should not be executed directly")); - } -}; - -class UniformRandomPrimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddOutput("Out", "(Tensor), The output tensor of uniform_random_p op."); - AddAttr>("shape", "The shape of the output tensor") - .SetDefault({}); - AddAttr("min", "Minimum value of uniform_random_p. [default -1.0]."); - AddAttr("max", "Maximun value of uniform_random_p. [default 1.0]."); - AddAttr("seed", - "Random seed used for generating samples. " - "0 means use a seed generated by the system." - "Note that if seed is not 0, this operator will always " - "generate the same random numbers every time. "); - AddAttr("dtype", "Output tensor data type. "); - AddComment(R"DOC( -Autograd primitive uniform_random_p operator. 
-)DOC"); - } -}; - -class UniformRandomPrimOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Out")[0]; - auto shape = ctx->Attrs().Get>("shape"); - PADDLE_GET(framework::VarDesc *, y_var_ptr)->SetShape(shape); - } -}; - -class UniformRandomPrimOpVarTypeInference - : public framework::StaticGraphVarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto y_name = Output(ctx, "Out")[0]; - auto data_type = static_cast( - PADDLE_GET_CONST(int, ctx->GetAttr("dtype"))); - SetDataType(ctx, y_name, data_type); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(uniform_random_p, - paddle::operators::UniformRandomPrimOp, - paddle::operators::UniformRandomPrimOpMaker, - paddle::operators::UniformRandomPrimOpShapeInference, - paddle::operators::UniformRandomPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/unity_build_rule.cmake b/paddle/fluid/operators/prim_ops/unity_build_rule.cmake deleted file mode 100644 index 74b04d234fcde..0000000000000 --- a/paddle/fluid/operators/prim_ops/unity_build_rule.cmake +++ /dev/null @@ -1,20 +0,0 @@ -register_unity_group( - cc - reshape_p_op.cc - broadcast_p_op.cc - reduce_p_op.cc - transpose_p_op.cc - split_p_op.cc - concat_p_op.cc - slice_select_p_op.cc - slice_assign_p_op.cc - gather_p_op.cc - scatter_add_p_op.cc - add_p_op.cc - sub_p_op.cc - mul_p_op.cc - div_p_op.cc - sqrt_p_op.cc - tanh_p_op.cc - matmul_p_op.cc - fill_constant_p_op.cc) diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc index 26647e8f05c83..e521fc0ffcacf 100644 --- a/paddle/fluid/operators/print_op.cc +++ b/paddle/fluid/operators/print_op.cc @@ -58,12 +58,11 @@ class PrintOp : public framework::OperatorBase { PADDLE_ENFORCE_NOT_NULL( in_var, - platform::errors::NotFound("The input:%s not found in scope", - Input("In"))); + phi::errors::NotFound("The input:%s not found in scope", Input("In"))); PADDLE_ENFORCE_NOT_NULL( out_var, - platform::errors::NotFound("The output:%s not found in scope", - Output("Out"))); + phi::errors::NotFound("The output:%s not found in scope", + Output("Out"))); auto &in_tensor = in_var->Get(); phi::DenseTensor *out_tensor = out_var->GetMutable(); diff --git a/paddle/fluid/operators/prune_gate_by_capacity_op.cc b/paddle/fluid/operators/prune_gate_by_capacity_op.cc index 365342fa7ea5f..4e4bc4d291d68 100644 --- a/paddle/fluid/operators/prune_gate_by_capacity_op.cc +++ b/paddle/fluid/operators/prune_gate_by_capacity_op.cc @@ -51,7 +51,7 @@ class PruneGateByCapacityOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( expert_count_num_ele, n_expert * n_worker, - platform::errors::Unavailable( + phi::errors::Unavailable( "The number of elements for expert_count is ( %ld ) incorrect. " "Because the number of expert_count must equal the " "product of n_worker ( %ld ) and n_expert ( %ld ). 
" @@ -76,11 +76,11 @@ class PruneGateByCapacityOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( gate_idx_data_type, expert_count_data_type, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dtype of the gate_idx and expert_count should be same")); PADDLE_ENFORCE_EQ(gate_idx_data_type, framework::proto::VarType::INT64, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dtype of the gate_idx and expert_count should " "be same as int64")); return phi::KernelKey(gate_idx_data_type, ctx.GetPlace()); diff --git a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc index 9d5e3eb00d0ef..ff8f9931b0e06 100644 --- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc +++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc @@ -29,15 +29,15 @@ class DistributedLookupTableOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE_EQ(ctx->HasInputs("Ids"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(Ids) of LookupTableOp should not be null.")); PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(W) of LookupTableOp should not be null.")); PADDLE_ENFORCE_EQ(ctx->HasOutputs("Outputs"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(Outs) of LookupTableOp should not be null.")); auto ids_dims = ctx->GetInputsDim("Ids"); @@ -46,13 +46,13 @@ class DistributedLookupTableOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( table_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only 2 dimensions of the 'Embedding' is supported.")); for (auto &ids_dim : ids_dims) { PADDLE_ENFORCE_EQ(ids_dim.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of the 'Ids' tensor must be 2.")); } diff --git a/paddle/fluid/operators/pscore/distributed_lookup_table_op.h b/paddle/fluid/operators/pscore/distributed_lookup_table_op.h index 414500c2faac3..258de211c482b 100644 --- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.h +++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.h @@ -39,7 +39,7 @@ class DistributedLookupTableKernel : public framework::OpKernel { } else if (var->IsType()) { emb_dim = var->Get().value().dims()[1]; } else { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Expected type of `W` must be Tensor, SelectedRows.But got " "unsupport type: %s.", framework::ToTypeName(var->Type()))); diff --git a/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc b/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc index 2a8b4f9be7698..cc9e0aeff1f01 100644 --- a/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc +++ b/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc @@ -29,11 +29,11 @@ class DistributedPushSparseOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE_EQ(ctx->HasInputs("Ids"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(Ids) of PushSparseOp should not be null.")); PADDLE_ENFORCE_EQ(ctx->HasOutputs("Outputs"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(Outs) of PushSparseOp should not be null.")); auto ids_dims = 
ctx->GetInputsDim("Ids"); @@ -41,7 +41,7 @@ class DistributedPushSparseOp : public framework::OperatorWithKernel { for (auto &ids_dim : ids_dims) { PADDLE_ENFORCE_EQ(ids_dim.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of the 'Ids' tensor must be 2.")); } diff --git a/paddle/fluid/operators/pscore/fake_init_op.cc b/paddle/fluid/operators/pscore/fake_init_op.cc index cd919cb7ca0bf..1104b8bed673e 100644 --- a/paddle/fluid/operators/pscore/fake_init_op.cc +++ b/paddle/fluid/operators/pscore/fake_init_op.cc @@ -43,7 +43,7 @@ class FakeInitOp : public framework::OperatorBase { tensor = out_var.GetMutable()->mutable_value(); tensor->Resize(common::make_ddim(Attr>("shape"))); } else { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "fake init op's output only" "supports SelectedRows and phi::DenseTensor")); } diff --git a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc index 978981a6fcdf3..be1e6c64b2484 100644 --- a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc +++ b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc @@ -66,13 +66,13 @@ void HeterListenAndServOp::RunAsyncLoop(framework::ProgramDesc *program) const { VLOG(3) << "after split, key = " << pieces[0] << ", id=" << pieces[1]; PADDLE_ENFORCE_EQ(pieces.size(), 2, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Invalid format of message_and_id argument. " "Expected \"message:block_id\". Received %s", grad_and_id.c_str())); PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0, - platform::errors::AlreadyExists( + phi::errors::AlreadyExists( "The message name %s has already existed in out_map", pieces[0].c_str())); @@ -87,7 +87,7 @@ void HeterListenAndServOp::RunAsyncLoop(framework::ProgramDesc *program) const { size_t num_blocks = program->Size(); PADDLE_ENFORCE_GE(num_blocks, 1, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Invalid number of blocks in server program. Expected " "equal or greater than 1. Received %zu", num_blocks)); @@ -136,7 +136,7 @@ void HeterListenAndServOp::RunImpl(const framework::Scope &scope, PADDLE_ENFORCE_EQ(heter_server_, nullptr, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "RPC service has been created unexpectedly.")); std::string endpoint = Attr("endpoint"); @@ -150,7 +150,7 @@ void HeterListenAndServOp::RunImpl(const framework::Scope &scope, Attr>("optimize_blocks"); PADDLE_ENFORCE_GE(optimize_blocks.size(), 1, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "optimize blocks is less than 1. Optimize blocks " "should be 1 at least on the pserver side.")); diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.cc b/paddle/fluid/operators/pull_box_extended_sparse_op.cc deleted file mode 100644 index 75918b9ad62a4..0000000000000 --- a/paddle/fluid/operators/pull_box_extended_sparse_op.cc +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/pull_box_extended_sparse_op.h" - -namespace paddle { -namespace operators { - -class PullBoxExtendedSparseOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_GE( - ctx->Inputs("Ids").size(), - 1UL, - platform::errors::InvalidArgument( - "Inputs(Ids) of PullBoxExtendedSparseOp should not be empty.")); - PADDLE_ENFORCE_GE( - ctx->Outputs("Out").size(), - 1UL, - platform::errors::InvalidArgument( - "Outputs(Out) of PullBoxExtendedSparseOp should not be empty.")); - PADDLE_ENFORCE_GE(ctx->Outputs("OutExtend").size(), - 1UL, - platform::errors::InvalidArgument( - "Outputs(OutExtend) of PullBoxExtendedSparseOp " - "should not be empty.")); - auto emb_size = static_cast<int64_t>(ctx->Attrs().Get<int>("emb_size")); - auto emb_extended_size = - static_cast<int64_t>(ctx->Attrs().Get<int>("emb_extended_size")); - auto all_ids_dim = ctx->GetInputsDim("Ids"); - const size_t n_ids = all_ids_dim.size(); - std::vector<framework::DDim> outs_dims; - std::vector<framework::DDim> outs_extended_dims; - outs_dims.resize(n_ids); - outs_extended_dims.resize(n_ids); - for (size_t i = 0; i < n_ids; ++i) { - const auto ids_dims = all_ids_dim[i]; - int ids_rank = ids_dims.size(); - PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], - 1, - platform::errors::InvalidArgument( - "Shape error in %lu id, the last dimension of the " - "'Ids' tensor must be 1.", - i)); - auto out_dim = - common::vectorize(common::slice_ddim(ids_dims, 0, ids_rank - 1)); - out_dim.push_back(emb_size); - outs_dims[i] = common::make_ddim(out_dim); - - auto out_extended_dim = - common::vectorize(common::slice_ddim(ids_dims, 0, ids_rank - 1)); - out_extended_dim.push_back(emb_extended_size); - outs_extended_dims[i] = common::make_ddim(out_extended_dim); - } - ctx->SetOutputsDim("Out", outs_dims); - ctx->SetOutputsDim("OutExtend", outs_extended_dims); - for (size_t i = 0; i < n_ids; ++i) { - ctx->ShareLoD("Ids", "Out", i, i); - ctx->ShareLoD("Ids", "OutExtend", i, i); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(framework::proto::VarType::FP32, ctx.GetPlace()); - } -}; - -class PullBoxExtendedSparseOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Ids", - "Input tensors with type int32 or int64 " - "contains the ids to be looked up in BoxPS. " - "The last dimension size must be 1.") - .AsDuplicable(); - AddOutput("Out", "The lookup results tensors.").AsDuplicable(); - AddOutput("OutExtend", "The lookup extended results tensors.") - .AsDuplicable(); - AddAttr<int>("emb_size", "(int, the embedding hidden size").SetDefault(1); - AddAttr<int>("emb_extended_size", - "(int, the extended_embedding hidden size") - .SetDefault(128); - AddComment(R"DOC( -Pull Box Extended Sparse Operator. - -This operator is used to perform lookups on the BoxPS, -then concatenated into a dense tensor. - -The input Ids can carry the LoD (Level of Details) information, -or not. 
And the output only shares the LoD information with input Ids. - -)DOC"); - } -}; - -template <typename T> -class PushBoxExtendedSparseOpMaker : public framework::SingleGradOpMaker<T> { - public: - using framework::SingleGradOpMaker<T>::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr<T> op) const override { - op->SetType("push_box_extended_sparse"); - op->SetInput("Ids", this->Input("Ids")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetInput(framework::GradVarName("OutExtend"), - this->OutputGrad("OutExtend")); - op->SetOutput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetAttrMap(this->Attrs()); - } -}; - -class PushBoxExtendedSparseOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override {} - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.GetPlace()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - pull_box_extended_sparse, - ops::PullBoxExtendedSparseOp, - ops::PullBoxExtendedSparseOpMaker, - ops::PushBoxExtendedSparseOpMaker<paddle::framework::OpDesc>, - ops::PushBoxExtendedSparseOpMaker<paddle::imperative::OpBase>); - -REGISTER_OPERATOR(push_box_extended_sparse, ops::PushBoxExtendedSparseOp); - -PD_REGISTER_STRUCT_KERNEL(pull_box_extended_sparse, - CPU, - ALL_LAYOUT, - ops::PullBoxExtendedSparseCPUKernel, - float, - double) {} -PD_REGISTER_STRUCT_KERNEL(push_box_extended_sparse, - CPU, - ALL_LAYOUT, - ops::PushBoxExtendedSparseCPUKernel, - float, - double) {} diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.cu b/paddle/fluid/operators/pull_box_extended_sparse_op.cu deleted file mode 100644 index 570c367c93182..0000000000000 --- a/paddle/fluid/operators/pull_box_extended_sparse_op.cu +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/pull_box_extended_sparse_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/phi/backends/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -template -class PullBoxExtendedSparseCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PullBoxExtendedSparseFunctor(ctx); - } -}; - -template -class PushBoxExtendedSparseCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PushBoxExtendedSparseFunctor(ctx); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL(pull_box_extended_sparse, - GPU, - ALL_LAYOUT, - ops::PullBoxExtendedSparseCUDAKernel, - float, - double) {} -PD_REGISTER_STRUCT_KERNEL(push_box_extended_sparse, - GPU, - ALL_LAYOUT, - ops::PushBoxExtendedSparseCUDAKernel, - float, - double) {} diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.h b/paddle/fluid/operators/pull_box_extended_sparse_op.h deleted file mode 100644 index 76e570f10fb64..0000000000000 --- a/paddle/fluid/operators/pull_box_extended_sparse_op.h +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include <memory> -#include <vector> - -#include "paddle/fluid/framework/fleet/box_wrapper.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" - -namespace paddle { -namespace operators { - -template <typename T> -static void PullBoxExtendedSparseFunctor( - const framework::ExecutionContext &ctx) { - auto inputs = ctx.MultiInput<phi::DenseTensor>("Ids"); - auto outputs = ctx.MultiOutput<phi::DenseTensor>("Out"); - auto outputs_extend = ctx.MultiOutput<phi::DenseTensor>("OutExtend"); - const auto slot_size = inputs.size(); - std::vector<const uint64_t *> all_keys(slot_size); - // BoxPS only supports float now - std::vector<float *> all_values(slot_size * 2); - std::vector<int64_t> slot_lengths(slot_size); - for (size_t i = 0; i < slot_size; i++) { - const auto *slot = inputs[i]; - const uint64_t *single_slot_keys = - reinterpret_cast<const uint64_t *>(slot->data<int64_t>()); - all_keys[i] = single_slot_keys; - slot_lengths[i] = slot->numel(); - auto *output = outputs[i]->mutable_data<T>(ctx.GetPlace()); - auto *output_extend = outputs_extend[i]->mutable_data<T>(ctx.GetPlace()); - all_values[i] = reinterpret_cast<float *>(output); - all_values[i + slot_size] = reinterpret_cast<float *>(output_extend); - } -#ifdef PADDLE_WITH_BOX_PS - auto emb_size = ctx.Attr<int>("emb_size"); - auto emb_extended_size = ctx.Attr<int>("emb_extended_size"); - auto box_ptr = paddle::framework::BoxWrapper::GetInstance(); - box_ptr->PullSparse(ctx.GetPlace(), - all_keys, - all_values, - slot_lengths, - emb_size, - emb_extended_size); -#endif -} - -template <typename T> -static void PushBoxExtendedSparseFunctor( - const framework::ExecutionContext &ctx) { - auto inputs = ctx.MultiInput<phi::DenseTensor>("Ids"); - auto d_output = - ctx.MultiInput<phi::DenseTensor>(framework::GradVarName("Out")); - auto d_output_extend = - ctx.MultiInput<phi::DenseTensor>(framework::GradVarName("OutExtend")); - const auto slot_size = inputs.size(); - std::vector<const uint64_t *> all_keys(slot_size); - std::vector<const float *> all_grad_values(slot_size * 2); - std::vector<int64_t> slot_lengths(slot_size); - int batch_size = -1; - for (size_t i = 0; i < slot_size; i++) { - const auto *slot = inputs[i]; - const uint64_t *single_slot_keys = - reinterpret_cast<const uint64_t *>(slot->data<int64_t>()); - all_keys[i] = single_slot_keys; - slot_lengths[i] = slot->numel(); - int cur_batch_size = - slot->lod().size() ? 
slot->lod()[0].size() - 1 : slot->dims()[0]; - if (batch_size == -1) { - batch_size = cur_batch_size; - } else { - PADDLE_ENFORCE_EQ(batch_size, - cur_batch_size, - platform::errors::PreconditionNotMet( - "The batch size of all input slots should be same," - "please check")); - } - const float *grad_value = d_output[i]->data<float>(); - const float *grad_value_extend = d_output_extend[i]->data<float>(); - all_grad_values[i] = reinterpret_cast<const float *>(grad_value); - all_grad_values[i + slot_size] = - reinterpret_cast<const float *>(grad_value_extend); - } -#ifdef PADDLE_WITH_BOX_PS - auto emb_size = ctx.Attr<int>("emb_size"); - auto emb_extended_size = ctx.Attr<int>("emb_extended_size"); - auto box_ptr = paddle::framework::BoxWrapper::GetInstance(); - box_ptr->PushSparseGrad(ctx.GetPlace(), - all_keys, - all_grad_values, - slot_lengths, - emb_size, - emb_extended_size, - batch_size); -#endif -} - -template <typename T, typename DeviceContext> -class PullBoxExtendedSparseCPUKernel : public framework::OpKernel<T> { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PullBoxExtendedSparseFunctor<T>(ctx); - } -}; - -template <typename T, typename DeviceContext> -class PushBoxExtendedSparseCPUKernel : public framework::OpKernel<T> { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PushBoxExtendedSparseFunctor<T>(ctx); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/pull_box_sparse_op.cc b/paddle/fluid/operators/pull_box_sparse_op.cc index d37cc35a59945..51786ffc0180d 100644 --- a/paddle/fluid/operators/pull_box_sparse_op.cc +++ b/paddle/fluid/operators/pull_box_sparse_op.cc @@ -24,12 +24,12 @@ class PullBoxSparseOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE( ctx->Inputs("Ids").size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Inputs(Ids) of PullBoxSparseOp should not be empty.")); PADDLE_ENFORCE_GE( ctx->Outputs("Out").size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Outputs(Out) of PullBoxSparseOp should not be empty.")); auto hidden_size = static_cast<int64_t>(ctx->Attrs().Get<int>("size")); auto all_ids_dim = ctx->GetInputsDim("Ids"); @@ -41,7 +41,7 @@ class PullBoxSparseOp : public framework::OperatorWithKernel { int ids_rank = ids_dims.size(); PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Shape error in %lu id, the last dimension of the " "'Ids' tensor must be 1.", i)); diff --git a/paddle/fluid/operators/pull_box_sparse_op.h b/paddle/fluid/operators/pull_box_sparse_op.h index 1ebfa11a2b2e6..06ebe7b5a93d3 100644 --- a/paddle/fluid/operators/pull_box_sparse_op.h +++ b/paddle/fluid/operators/pull_box_sparse_op.h @@ -82,7 +82,7 @@ static void PushBoxSparseFunctor(const framework::ExecutionContext &ctx) { } else { PADDLE_ENFORCE_EQ(batch_size, cur_batch_size, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The batch size of all input slots should be same, " "please cheack")); } diff --git a/paddle/fluid/operators/pull_gpups_sparse_op.cc b/paddle/fluid/operators/pull_gpups_sparse_op.cc index 6055632f5681a..946a1b8c7136b 100644 --- a/paddle/fluid/operators/pull_gpups_sparse_op.cc +++ b/paddle/fluid/operators/pull_gpups_sparse_op.cc @@ -24,21 +24,21 @@ class PullGpuPSSparseOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE( ctx->Inputs("Ids").size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Inputs(Ids) of PullGpuPSSparseOp should not be empty.")); PADDLE_ENFORCE_GE( ctx->Outputs("Out").size(), 1UL, - 
platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Outputs(Out) of PullGpuPSSparseOp should not be empty.")); auto embedding_size_vec = ctx->Attrs().Get<std::vector<int>>("size"); PADDLE_ENFORCE_EQ( ctx->Inputs("Ids").size(), embedding_size_vec.size(), - platform::errors::InvalidArgument("The ids size: %lu must be equal to " "the length of embedding size: %lu.", ctx->Inputs("Ids").size(), embedding_size_vec.size())); + phi::errors::InvalidArgument("The ids size: %lu must be equal to " + "the length of embedding size: %lu.", + ctx->Inputs("Ids").size(), + embedding_size_vec.size())); auto all_ids_dim = ctx->GetInputsDim("Ids"); const size_t n_ids = all_ids_dim.size(); std::vector<framework::DDim> outs_dims; @@ -49,7 +49,7 @@ class PullGpuPSSparseOp : public framework::OperatorWithKernel { int ids_rank = ids_dims.size(); PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Shape error in %lu id, the last dimension of the " "'Ids' tensor must be 1.", i)); diff --git a/paddle/fluid/operators/pull_gpups_sparse_op.h b/paddle/fluid/operators/pull_gpups_sparse_op.h index e5e08cfdde685..098e9b143a8e1 100644 --- a/paddle/fluid/operators/pull_gpups_sparse_op.h +++ b/paddle/fluid/operators/pull_gpups_sparse_op.h @@ -78,7 +78,7 @@ static void PushGpuPSSparseFunctor(const framework::ExecutionContext &ctx) { } else { PADDLE_ENFORCE_EQ(batch_size, cur_batch_size, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The batch size of all input slots should be same, " "please check")); } diff --git a/paddle/fluid/operators/pull_sparse_op.cc b/paddle/fluid/operators/pull_sparse_op.cc index 55a6af8466b86..dcea341c8a9e9 100644 --- a/paddle/fluid/operators/pull_sparse_op.cc +++ b/paddle/fluid/operators/pull_sparse_op.cc @@ -25,11 +25,11 @@ class PullSparseOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE_GE(ctx->Inputs("Ids").size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(Ids) of PullSparseOp can not be null")); PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(Out) of PullSparseOp can not be null")); auto hidden_size = @@ -43,7 +43,7 @@ class PullSparseOp : public framework::OperatorWithKernel { int ids_rank = ids_dims.size(); PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Shape error in %lu id, the last dimension of " " the 'Ids' tensor must be 1.", i)); diff --git a/paddle/fluid/operators/pull_sparse_v2_op.cc b/paddle/fluid/operators/pull_sparse_v2_op.cc index d134607d3c4bb..07af5da7ef92a 100644 --- a/paddle/fluid/operators/pull_sparse_v2_op.cc +++ b/paddle/fluid/operators/pull_sparse_v2_op.cc @@ -25,11 +25,11 @@ class PullSparseV2Op : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE_GE(ctx->Inputs("Ids").size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(Ids) of PullSparseV2Op can not be null")); PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(Out) of PullSparseV2Op can not be null")); auto hidden_size = diff --git a/paddle/fluid/operators/push_dense_op.cc b/paddle/fluid/operators/push_dense_op.cc index c0b9b04500648..080610c6e0df1 100644 --- a/paddle/fluid/operators/push_dense_op.cc +++ 
b/paddle/fluid/operators/push_dense_op.cc @@ -25,7 +25,7 @@ class PushDenseOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE_GE(ctx->Inputs("Ids").size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(Ids) of PushDenseOp can not be null.")); } diff --git a/paddle/fluid/operators/push_dense_op.h b/paddle/fluid/operators/push_dense_op.h index ec7b6b6c3f0bf..6ec833df39583 100644 --- a/paddle/fluid/operators/push_dense_op.h +++ b/paddle/fluid/operators/push_dense_op.h @@ -33,7 +33,7 @@ void PushDenseFunctor(const framework::ExecutionContext& ctx) { auto table_id = static_cast<uint64_t>(ctx.Attr<int>("TableId")); PADDLE_ENFORCE_GT(table_id, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "table id should > 0, but value is ", table_id)); float scale_datanorm = ctx.Attr<float>("ScaleDataNorm"); const auto& ids = ctx.MultiInput<phi::DenseTensor>("Ids"); @@ -41,7 +41,7 @@ void PushDenseFunctor(const framework::ExecutionContext& ctx) { ids[0]->lod().size() ? ids[0]->lod()[0].size() - 1 : ids[0]->dims()[0]; PADDLE_ENFORCE_GT(batch_size, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "batch size should > 0, but value is ", batch_size)); auto fleet_ptr = framework::FleetWrapper::GetInstance(); @@ -51,10 +51,10 @@ void PushDenseFunctor(const framework::ExecutionContext& ctx) { // note: GetInstance() is not thread-safe // we assume PullDenseWorker has been already initialized in DistMultiTrainer auto pull_dense_worker = framework::PullDenseWorker::GetInstance(); - PADDLE_ENFORCE_NE(pull_dense_worker, - nullptr, - platform::errors::PreconditionNotMet( - "pull_dense_worker should not be null")); + PADDLE_ENFORCE_NE( + pull_dense_worker, + nullptr, + phi::errors::PreconditionNotMet("pull_dense_worker should not be null")); int thread_id = pull_dense_worker->GetThreadIdByScope(&ctx.scope()); pull_dense_worker->IncreaseThreadVersion(thread_id, table_id); #endif diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc index 7d9c8ceca4943..5e3fa0b5507a0 100644 --- a/paddle/fluid/operators/py_func_op.cc +++ b/paddle/fluid/operators/py_func_op.cc @@ -47,7 +47,7 @@ static py::object *GetPythonCallableObject(size_t i) { PADDLE_ENFORCE_LT( i, g_py_callables.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Invalid python callable id %d, which should be less than %d.", i, g_py_callables.size())); @@ -81,7 +81,7 @@ static void CallPythonFunc(py::object *callable, // Otherwise, ret_num must be equal to out_num PADDLE_ENFORCE_EQ(ret_num == 1, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Python function has no return values or returns " "None. In this case, ret_num = 1 && ret[0] == None " "&& out_num should be 0. But ret_num is %d", @@ -90,7 +90,7 @@ static void CallPythonFunc(py::object *callable, PADDLE_ENFORCE_EQ( out_num == 0, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Python function has no return values or returns None. In " "this case, ret_num = 1 && ret[0] == None && out_num should " "be 0. But out_num is %d", @@ -99,7 +99,7 @@ static void CallPythonFunc(py::object *callable, PADDLE_ENFORCE_EQ( py::cast<phi::DenseTensor *>(ret_tuple[0]) == nullptr, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Python function has no return values or returns None. In " "this case, ret_num = 1 && ret[0] == None && out_num should " "be 0. 
But ret[0] is not None")); @@ -113,12 +113,12 @@ static void CallPythonFunc(py::object *callable, try { auto *py_out_tensor = py::cast(ret_tuple[i]); PADDLE_ENFORCE_NOT_NULL(py_out_tensor, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output tensor %d should not be nullptr", i)); out->set_lod(py_out_tensor->lod()); out->ShareDataWith(*py_out_tensor); } catch (py::cast_error &) { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "py::cast to phi::DenseTensor error. The %d-th output exception is " "phi::DenseTensor", i)); @@ -139,15 +139,15 @@ class PyFuncOpVarTypeInference : public framework::StaticGraphVarTypeInference { PADDLE_ENFORCE_EQ( has_in || has_out, true, - platform::errors::InvalidArgument("Input(X) or Output(Out) must exist, " - "but has_in is %d, has_out is %d.", - has_in, - has_out)); + phi::errors::InvalidArgument("Input(X) or Output(Out) must exist, " + "but has_in is %d, has_out is %d.", + has_in, + has_out)); PADDLE_ENFORCE_GE( PADDLE_GET_CONST(int, ctx->GetAttr(kForwardPythonCallableId.data())), 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Function id cannot be less than 0, but received value is %d.", PADDLE_GET_CONST(int, ctx->GetAttr(kForwardPythonCallableId.data())))); @@ -192,8 +192,8 @@ class PyFuncOpShapeInference : public framework::InferShapeBase { PADDLE_ENFORCE_EQ( !ctx->IsRuntime(), true, - platform::errors::InvalidArgument("Shape inference cannot be called at " - "run time in 'py_func' operator.")); + phi::errors::InvalidArgument("Shape inference cannot be called at " + "run time in 'py_func' operator.")); } }; diff --git a/paddle/fluid/operators/pyramid_hash_op.cc b/paddle/fluid/operators/pyramid_hash_op.cc index f5a8fcaa9de0c..3f92da5d73676 100644 --- a/paddle/fluid/operators/pyramid_hash_op.cc +++ b/paddle/fluid/operators/pyramid_hash_op.cc @@ -88,24 +88,24 @@ class PyramidHashOP : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of PyramidHashOP is not found.")); + phi::errors::NotFound("Input(X) of PyramidHashOP is not found.")); PADDLE_ENFORCE_EQ( ctx->HasInput("W"), true, - platform::errors::NotFound("Input(W) of PyramidHashOP is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), - true, - platform::errors::NotFound( - "Output(Out) of PyramidHashOP is not found.")); + phi::errors::NotFound("Input(W) of PyramidHashOP is not found.")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput("Out"), + true, + phi::errors::NotFound("Output(Out) of PyramidHashOP is not found.")); PADDLE_ENFORCE_EQ(ctx->HasOutput("DropPos"), true, - platform::errors::NotFound( + phi::errors::NotFound( "Output(DropPos) of PyramidHashOP is not found.")); auto x_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(x_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rank of Input(X) of PyramidHashOP is invalid. " "It should be 2, but got %d", x_dims.size())); @@ -113,7 +113,7 @@ class PyramidHashOP : public framework::OperatorWithKernel { auto w_dims = ctx->GetInputDim("W"); PADDLE_ENFORCE_EQ(w_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rank of Input(W) of PyramidHashOP is invalid. 
" "It should be 2, but got %d", w_dims.size())); @@ -124,7 +124,7 @@ class PyramidHashOP : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( w_dims[0], space_len + rand_len, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dimension of Input(W) of PyramidHashOP is invalid. " "It should be space_len + rand_len, but now %d != %d + %d", w_dims[0], @@ -133,7 +133,7 @@ class PyramidHashOP : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( w_dims[1], 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dimension of Input(W) of PyramidHashOP is invalid." " It should be 1, but got %d", w_dims[1])); @@ -142,7 +142,7 @@ class PyramidHashOP : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( num_emb % rand_len, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The PyramidHashOP's Attr(num_emb) should mod Attr(rand_len), " "but num_emb is %d, rand_len is %d", num_emb, @@ -153,19 +153,19 @@ class PyramidHashOP : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->HasInput("WhiteList"), true, - platform::errors::NotFound("Input(WhiteList) of PyramidHashOP is not " - "found but white_list_len > 0.")); + phi::errors::NotFound("Input(WhiteList) of PyramidHashOP is not " + "found but white_list_len > 0.")); auto wl_dims = ctx->GetInputDim("WhiteList"); PADDLE_ENFORCE_EQ( wl_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rank of Input(WhiteList) of PyramidHashOP is invalid." " It should be 2, but got %d", wl_dims.size())); PADDLE_ENFORCE_EQ(wl_dims[0], white_list_len, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dimension of Input(WhiteList) of " "PyramidHashOP is invalid." " It should be equal to Attr(white_list_len) " @@ -174,7 +174,7 @@ class PyramidHashOP : public framework::OperatorWithKernel { white_list_len)); PADDLE_ENFORCE_EQ(wl_dims[1], 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dimension of Input(WhiteList) of " "PyramidHashOP is invalid." " It should be 1, but got %d", @@ -186,19 +186,19 @@ class PyramidHashOP : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->HasInput("BlackList"), true, - platform::errors::NotFound("Input(BlackList) of PyramidHashOP is not " - "found but black_list_len > 0.")); + phi::errors::NotFound("Input(BlackList) of PyramidHashOP is not " + "found but black_list_len > 0.")); auto bl_dims = ctx->GetInputDim("BlackList"); PADDLE_ENFORCE_EQ( bl_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rank of Input(BlackList) of PyramidHashOP is invalid." " It should be 2, but got %d", bl_dims.size())); PADDLE_ENFORCE_EQ(bl_dims[0], black_list_len, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dimension of Input(BlackList) of " "PyramidHashOP is invalid." " It should be equal to Attr(black_list_len)" @@ -207,7 +207,7 @@ class PyramidHashOP : public framework::OperatorWithKernel { black_list_len)); PADDLE_ENFORCE_EQ(bl_dims[1], 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The second dimension of Input(BlackList) of " "PyramidHashOP is invalid." 
" It should be 1, but got %d", @@ -315,7 +315,7 @@ class CPUPyramidHashOPKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( math::bloomfilter_check(_filter), 1, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The white filter is not loaded successfully, please make sure " "'white_list_len': %d is valid for Input(WhiteList).", white_list_len)); @@ -325,7 +325,7 @@ class CPUPyramidHashOPKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( math::bloomfilter_check(_black_filter), 1, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The black filter is not loaded successfully, please make sure " "'black_list_len': %d is valid for Input(BlackList).", black_list_len)); @@ -442,27 +442,27 @@ class PyramidHashOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), - true, - platform::errors::NotFound( - "Input(X) of PyramidHashOpGrad is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("W"), - true, - platform::errors::NotFound( - "Input(W) of PyramidHashOpGrad is not found.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("X"), + true, + phi::errors::NotFound("Input(X) of PyramidHashOpGrad is not found.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("W"), + true, + phi::errors::NotFound("Input(W) of PyramidHashOpGrad is not found.")); PADDLE_ENFORCE_EQ(ctx->HasInput("DropPos"), true, - platform::errors::NotFound( + phi::errors::NotFound( "Input(DropPos) of PyramidHashOpGrad is not found.")); PADDLE_ENFORCE_EQ( ctx->HasInput("X_Temp_Out"), true, - platform::errors::NotFound( + phi::errors::NotFound( "Input(X_Temp_Out) of PyramidHashOpGrad is not found.")); PADDLE_ENFORCE_EQ( ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::NotFound( + phi::errors::NotFound( "Input(Out@Grad) of PyramidHashOpGrad is not found.")); } diff --git a/paddle/fluid/operators/quantize_linear_op.cc b/paddle/fluid/operators/quantize_linear_op.cc index c0ef288b5134b..44ff53e8a7d7b 100644 --- a/paddle/fluid/operators/quantize_linear_op.cc +++ b/paddle/fluid/operators/quantize_linear_op.cc @@ -164,17 +164,16 @@ class QuantizeLinearOpMaker : public framework::OpProtoAndCheckerMaker { PADDLE_ENFORCE_EQ( quant_axis == 0 || quant_axis == 1 || quant_axis == -1, true, - platform::errors::InvalidArgument( - "'quant_axis' should be 0 or 1, but " - "the received is %d", - quant_axis)); + phi::errors::InvalidArgument("'quant_axis' should be 0 or 1, but " + "the received is %d", + quant_axis)); }); AddAttr("bit_length", "(int, default 8)") .SetDefault(8) .AddCustomChecker([](const int &bit_length) { PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'bit_length' should be between 1 and 16, but " "the received is %d", bit_length)); @@ -190,7 +189,7 @@ class QuantizeLinearOpMaker : public framework::OpProtoAndCheckerMaker { PADDLE_ENFORCE_EQ( round_type == 0 || round_type == 1, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'round_type' should be 0 or 1, 0 rounding to " "nearest ties to even and 1 is rounding to nearest " "ties away from zero.but the received is %d", diff --git a/paddle/fluid/operators/quantize_linear_op.cu b/paddle/fluid/operators/quantize_linear_op.cu index 8bcbc1107e9d1..d9aa1a860f405 100644 --- a/paddle/fluid/operators/quantize_linear_op.cu +++ b/paddle/fluid/operators/quantize_linear_op.cu 
@@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/fluid/operators/quantize_linear_op.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" -using float16 = paddle::platform::float16; +using float16 = phi::dtype::float16; namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/queue_generator_op.cc b/paddle/fluid/operators/queue_generator_op.cc index 8eee44d6827ea..ca4f943885b2f 100644 --- a/paddle/fluid/operators/queue_generator_op.cc +++ b/paddle/fluid/operators/queue_generator_op.cc @@ -46,13 +46,13 @@ class QueueGeneratorOp : public framework::OperatorBase { PADDLE_ENFORCE_GT( names.size(), 0, - platform::errors::InvalidArgument("The attribute 'names' for " - "Op(queue_generator) must be set.")); + phi::errors::InvalidArgument("The attribute 'names' for " + "Op(queue_generator) must be set.")); int capacity = Attr<int>("capacity"); PADDLE_ENFORCE_GT(capacity, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The attribute 'capacity' for Op(queue_generator) " "must be set a positive value, " "but the one received is %d.", @@ -71,8 +71,8 @@ class QueueGeneratorOp : public framework::OperatorBase { auto var = scope->FindVar(name); PADDLE_ENFORCE_NOT_NULL( var, - platform::errors::NotFound( - "Can't find var named '%s' in the global scope.", name)); + phi::errors::NotFound("Can't find var named '%s' in the global scope.", + name)); auto ptr = var->GetMutable<reader::LoDTensorBlockingQueueHolder>(); ptr->InitOnce(capacity); diff --git a/paddle/fluid/operators/random_routing_op.cc b/paddle/fluid/operators/random_routing_op.cc index dffcc9c361a66..e579b3f6146e2 100644 --- a/paddle/fluid/operators/random_routing_op.cc +++ b/paddle/fluid/operators/random_routing_op.cc @@ -37,17 +37,17 @@ class RandomRoutingOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(prob_dims[0], topk_val_dims[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(Out) of ScatterNdAddOp should not be null.")); PADDLE_ENFORCE_EQ(topk_idx_dims[1], topk_val_dims[1], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(Out) of ScatterNdAddOp should not be null.")); PADDLE_ENFORCE_EQ(topk_idx_dims[0], topk_val_dims[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(Out) of ScatterNdAddOp should not be null.")); ctx->SetOutputDim("Out", topk_idx_dims); @@ -62,7 +62,7 @@ class RandomRoutingOp : public framework::OperatorWithKernel { OperatorWithKernel::IndicateVarDataType(ctx, "TopK_Idx"); PADDLE_ENFORCE_EQ(topk_idx_dtype, framework::proto::VarType::INT64, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dtype of the topk_idx_dtype should be int64")); const auto& topk_value_type = diff --git a/paddle/fluid/operators/range_op.h b/paddle/fluid/operators/range_op.h index 195ef276b957e..8350794081bbe 100644 --- a/paddle/fluid/operators/range_op.h +++ b/paddle/fluid/operators/range_op.h @@ -23,23 +23,23 @@ namespace operators { template <typename T> void GetSize(T start, T end, T step, int64_t* size) { - PADDLE_ENFORCE_NE(step, - 0, - platform::errors::InvalidArgument( - "The step of range op should not be 0.")); + PADDLE_ENFORCE_NE( + step, + 0, + phi::errors::InvalidArgument("The step of range op should not be 0.")); if (start < end) { PADDLE_ENFORCE_GT( step, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The step should be greater than 0 while start < end.")); } if (start > end) { PADDLE_ENFORCE_LT(step, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The 
step should be less than 0 while start > end.")); - } } diff --git a/paddle/fluid/operators/rank_attention_op.cc b/paddle/fluid/operators/rank_attention_op.cc index 95de6f4e08054..aaef2782f5e21 100644 --- a/paddle/fluid/operators/rank_attention_op.cc +++ b/paddle/fluid/operators/rank_attention_op.cc @@ -27,32 +27,32 @@ class RankAttentionOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(X) of RankAttentionOp should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasInput("RankOffset"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(RankOffset) of RankAttentionOp should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasInput("RankParam"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(RankParam) of RankAttentionOp should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasOutput("InsRank"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(InsRank) of RankAttentionOp should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasOutput("InputHelp"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(InputHelp) of RankAttentionOp should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(Out) of RankAttentionOp should not be null.")); auto max_rank = ctx->Attrs().Get<int>("MaxRank"); @@ -64,13 +64,13 @@ class RankAttentionOp : public framework::OperatorWithKernel { auto x_fea_dim = x_dims[1]; auto block_matrix_row = max_rank * x_fea_dim; - PADDLE_ENFORCE_EQ((rank_offset_dims[1] - 1) / 2, - max_rank, - platform::errors::InvalidArgument( - "Input(RankOffset) has wrong columns, " - "except columns to be %d, but got %d", - max_rank, - (rank_offset_dims[1] - 1) / 2)); + PADDLE_ENFORCE_EQ( + (rank_offset_dims[1] - 1) / 2, + max_rank, + phi::errors::InvalidArgument("Input(RankOffset) has wrong columns, " + "except columns to be %d, but got %d", + max_rank, + (rank_offset_dims[1] - 1) / 2)); ctx->SetOutputDim("Out", {ins_num, para_col}); ctx->SetOutputDim("InputHelp", {ins_num, block_matrix_row}); @@ -94,23 +94,23 @@ class RankAttentionGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->HasInput("X"), true, - platform::errors::InvalidArgument("Input(X) should not be null")); - PADDLE_ENFORCE_EQ(ctx->HasInput("RankParam"), - true, - platform::errors::InvalidArgument( - "Input(RankParam) should not be null")); - PADDLE_ENFORCE_EQ(ctx->HasInput("RankOffset"), - true, - platform::errors::InvalidArgument( - "Input(RankOffset) should not be null")); - PADDLE_ENFORCE_EQ(ctx->HasInput("InputHelp"), - true, - platform::errors::InvalidArgument( - "Input(InputHelp) should not be null")); + phi::errors::InvalidArgument("Input(X) should not be null")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("RankParam"), + true, + phi::errors::InvalidArgument("Input(RankParam) should not be null")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("RankOffset"), + true, + phi::errors::InvalidArgument("Input(RankOffset) should not be null")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("InputHelp"), + true, + phi::errors::InvalidArgument("Input(InputHelp) should not be null")); PADDLE_ENFORCE_EQ( ctx->HasInput("InsRank"), true, - platform::errors::InvalidArgument("Input(InsRank) should not be null")); + phi::errors::InvalidArgument("Input(InsRank) should not be null")); 
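Both RankAttention InferShape checks above and the CUDA kernel in the next hunk recover max_rank from the RankOffset width as (cols - 1) / 2. A hedged standalone sketch of that relationship (the helper name and the column-layout interpretation are assumptions drawn only from the checks shown here):

#include <cassert>
#include <cstdint>

// The checks treat RankOffset as [ins_num, 1 + 2 * max_rank]: one leading
// column plus two columns per rank, hence max_rank = (cols - 1) / 2.
int64_t MaxRankFromOffsetCols(int64_t rank_offset_cols) {
  assert(rank_offset_cols >= 3 && (rank_offset_cols - 1) % 2 == 0);
  return (rank_offset_cols - 1) / 2;
}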
ctx->SetOutputDim(framework::GradVarName("RankParam"), ctx->GetInputDim("RankParam")); diff --git a/paddle/fluid/operators/rank_attention_op.cu b/paddle/fluid/operators/rank_attention_op.cu index 6d6c4c6a6d1dc..d73de790a527e 100644 --- a/paddle/fluid/operators/rank_attention_op.cu +++ b/paddle/fluid/operators/rank_attention_op.cu @@ -48,15 +48,15 @@ class RankAttentionCUDAKernel : public framework::OpKernel<T> { PADDLE_ENFORCE_EQ( rank_offset_dims[0], ins_num, - platform::errors::InvalidArgument("Input(RankOffset) has wrong rows.")); + phi::errors::InvalidArgument("Input(RankOffset) has wrong rows.")); - PADDLE_ENFORCE_EQ((rank_offset_dims[1] - 1) / 2, - max_rank, - platform::errors::InvalidArgument( - "Input(RankOffset) has wrong columns.")); + PADDLE_ENFORCE_EQ( + (rank_offset_dims[1] - 1) / 2, + max_rank, + phi::errors::InvalidArgument("Input(RankOffset) has wrong columns.")); PADDLE_ENFORCE_EQ( max_rank * max_rank * x_fea_dim, para_row, - platform::errors::InvalidArgument("Input(RankParam) has wrong rows.")); + phi::errors::InvalidArgument("Input(RankParam) has wrong rows.")); int block_matrix_row = max_rank * x_fea_dim; diff --git a/paddle/fluid/operators/rank_attention_op.h b/paddle/fluid/operators/rank_attention_op.h index 5124e91653810..f119c4a2f315c 100644 --- a/paddle/fluid/operators/rank_attention_op.h +++ b/paddle/fluid/operators/rank_attention_op.h @@ -23,10 +23,10 @@ template <typename T, typename DeviceContext> class RankAttentionKernel : public framework::OpKernel<T> { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), - true, - platform::errors::Unimplemented( - "Rank Attention only supports GPU now.")); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), + true, + phi::errors::Unimplemented("Rank Attention only supports GPU now.")); } }; } // namespace operators diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc deleted file mode 100644 index ebdddfd41b33f..0000000000000 --- a/paddle/fluid/operators/rank_loss_op.cc +++ /dev/null @@ -1,254 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/rank_loss_op.h" - -#include <memory> - -namespace paddle { -namespace framework { -class InferShapeContext; -class OpDesc; -} // namespace framework -namespace imperative { -class OpBase; -} // namespace imperative -} // namespace paddle - -namespace paddle { -namespace operators { - -class RankLossOp : public framework::OperatorWithKernel { - public: - RankLossOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorWithKernel(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "RankLoss"); - OP_INOUT_CHECK(ctx->HasInput("Left"), "Input", "Left", "RankLoss"); - OP_INOUT_CHECK(ctx->HasInput("Right"), "Input", "Right", "RankLoss"); - - auto label_dims = ctx->GetInputDim("Label"); - auto left_dims = ctx->GetInputDim("Left"); - auto right_dims = ctx->GetInputDim("Right"); - // check label_dims valid - PADDLE_ENFORCE_GE( - label_dims.size(), - 1, - platform::errors::InvalidArgument( - "The dimension size of Input(Label) must be greater than " - "or equal to 1, but received %d.", - label_dims.size())); - PADDLE_ENFORCE_LE( - label_dims.size(), - 2, - platform::errors::InvalidArgument("The dimension size of Input(Label) " - "must be less than or equal to 2, " - "but received %d.", - label_dims.size())); - if (label_dims.size() == 2U) { - PADDLE_ENFORCE_EQ( - label_dims[1], - 1, - platform::errors::InvalidArgument( - "The last dimension of Input(Label) must be 1, but received %d.", - label_dims[1])); - } - // check left_dims valid - PADDLE_ENFORCE_GE( - left_dims.size(), - 1, - platform::errors::InvalidArgument( - "The dimension size of Input(Left) must be greater than " - "or equal to 1, but received %d.", - left_dims.size())); - PADDLE_ENFORCE_LE( - left_dims.size(), - 2, - platform::errors::InvalidArgument("The dimension size of Input(Left) " - "must be less than or equal to 2, " - "but received %d.", - left_dims.size())); - if (left_dims.size() == 2U) { - PADDLE_ENFORCE_EQ( - left_dims[1], - 1, - platform::errors::InvalidArgument( - "The last dimension of Input(Left) must be 1, but received %d.", - left_dims[1])); - } - // check right_dims valid - PADDLE_ENFORCE_GE( - right_dims.size(), - 1, - platform::errors::InvalidArgument( - "The dimension size of Input(Right) must be greater than " - "or equal to 1, but received %d.", - right_dims.size())); - PADDLE_ENFORCE_LE( - right_dims.size(), - 2, - platform::errors::InvalidArgument("The dimension size of Input(Right) " - "must be less than or equal to 2, " - "but received %d.", - right_dims.size())); - if (right_dims.size() == 2U) { - PADDLE_ENFORCE_EQ( - right_dims[1], - 1, - platform::errors::InvalidArgument( - "The last dimension of Input(Right) must be 1, but received %d.", - right_dims[1])); - } - PADDLE_ENFORCE_EQ( - label_dims[0], - left_dims[0], - platform::errors::InvalidArgument( - "The first dimension of Input(Label) and Input(Left) " - "must have the same value. But received Label.dims[0]=%d, " - "Left.dims[0]=%d.", - label_dims[0], - left_dims[0])); - PADDLE_ENFORCE_EQ( - label_dims[0], - right_dims[0], - platform::errors::InvalidArgument( - "The first dimension of Input(Label) and Input(Right) " - "must have the same value. 
But received Label.dims[0]=%d, " - "Right.dims[0]=%d.", - label_dims[0], - right_dims[0])); - ctx->SetOutputDim("Out", label_dims); - } -}; - -class RankLossOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Label", - "(2-D Tensor with shape [batch_size x 1]) " - "The label indicating A ranked higher than B or not."); - AddInput("Left", - "(2-D Tensor with shape [batch_size x 1]) " - "The output of RankNet for doc A."); - AddInput("Right", - "(2-D Tensor with shape [batch_size x 1]) " - "The output of RankNet for doc B."); - AddOutput("Out", - "(2-D Tensor with shape [batch_size x 1]) " - "The output loss of RankLoss operator."); - AddComment(R"DOC( -RankLoss Operator. - -RankLoss operator for RankNet -(http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf). -RankNet is a pairwise ranking model with -one training sample consisting of a pair of doc A and B, and the label P -indicating that A is ranked higher than B or not: - -P = {0, 1} or {0, 0.5, 1}, where 0.5 means no information about the rank of -the input pair. - -The RankLoss operator takes three inputs: Left (o_i), Right (o_j) and Label -(P_{i,j}), which represent the output score of RankNet for the two docs and -the label respectively, and yields the rank loss C_{i,j} using the following -equation: - -$$ - C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\ - o_{i,j} = o_i - o_j \\ - \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \} -$$ - -The operator can take batch inputs with size batch_size (batch_size >= 1). - -)DOC"); - } -}; - -class RankLossGradOp : public framework::OperatorWithKernel { - public: - RankLossGradOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorWithKernel(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "RankLossGrad"); - OP_INOUT_CHECK(ctx->HasInput("Left"), "Input", "Left", "RankLossGrad"); - OP_INOUT_CHECK(ctx->HasInput("Right"), "Input", "Right", "RankLossGrad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "RankLossGrad"); - - auto left_dims = ctx->GetInputDim("Left"); - auto right_dims = ctx->GetInputDim("Right"); - auto left_grad_name = framework::GradVarName("Left"); - auto right_grad_name = framework::GradVarName("Right"); - - if (ctx->HasOutput(left_grad_name)) { - ctx->SetOutputDim(left_grad_name, left_dims); - } - - if (ctx->HasOutput(right_grad_name)) { - ctx->SetOutputDim(right_grad_name, right_dims); - } - } -}; - -template -class RankLossGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("rank_loss_grad"); - op->SetInput("Label", this->Input("Label")); - op->SetInput("Left", this->Input("Left")); - op->SetInput("Right", this->Input("Right")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("Left"), this->InputGrad("Left")); - op->SetOutput(framework::GradVarName("Right"), this->InputGrad("Right")); - op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle -namespace ops = paddle::operators; - -REGISTER_OPERATOR(rank_loss, - ops::RankLossOp, - ops::RankLossOpMaker, - 
ops::RankLossGradMaker, - ops::RankLossGradMaker); -REGISTER_OPERATOR(rank_loss_grad, ops::RankLossGradOp); - -PD_REGISTER_STRUCT_KERNEL( - rank_loss, CPU, ALL_LAYOUT, ops::RankLossKernel, float) {} -PD_REGISTER_STRUCT_KERNEL( - rank_loss_grad, CPU, ALL_LAYOUT, ops::RankLossGradKernel, float) {} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_STRUCT_KERNEL( - rank_loss, GPU, ALL_LAYOUT, ops::RankLossKernel, float) {} -PD_REGISTER_STRUCT_KERNEL( - rank_loss_grad, GPU, ALL_LAYOUT, ops::RankLossGradKernel, float) {} -#endif diff --git a/paddle/fluid/operators/rank_loss_op.h b/paddle/fluid/operators/rank_loss_op.h deleted file mode 100644 index 03e0a094555e3..0000000000000 --- a/paddle/fluid/operators/rank_loss_op.h +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" - -namespace paddle { -namespace operators { - -template -class RankLossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* out_t = ctx.Output("Out"); - auto* label_t = ctx.Input("Label"); - auto* left_t = ctx.Input("Left"); - auto* right_t = ctx.Input("Right"); - out_t->mutable_data(ctx.GetPlace()); - - auto out = framework::EigenVector::Flatten(*out_t); - auto label = framework::EigenVector::Flatten(*label_t); - auto left = framework::EigenVector::Flatten(*left_t); - auto right = framework::EigenVector::Flatten(*right_t); - - auto& dev = *ctx.template device_context().eigen_device(); - EigenRankLoss, T>::Eval( - dev, out, label, left, right); - } -}; - -template -class RankLossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_left_t = - ctx.Output(framework::GradVarName("Left")); - auto* d_right_t = - ctx.Output(framework::GradVarName("Right")); - - auto* d_out_t = ctx.Input(framework::GradVarName("Out")); - auto* label_t = ctx.Input("Label"); - auto* left_t = ctx.Input("Left"); - auto* right_t = ctx.Input("Right"); - - auto& dev = *ctx.template device_context().eigen_device(); - auto d_out = framework::EigenVector::Flatten(*d_out_t); - auto label = framework::EigenVector::Flatten(*label_t); - auto left = framework::EigenVector::Flatten(*left_t); - auto right = framework::EigenVector::Flatten(*right_t); - - // compute d_left - if (d_left_t) { - d_left_t->mutable_data(ctx.GetPlace()); - auto d_left = framework::EigenVector::Flatten(*d_left_t); - EigenRankLossGrad, T>::EvalLeft( - dev, d_left, d_out, label, left, right); - } - // compute d_right - if (d_right_t) { - d_right_t->mutable_data(ctx.GetPlace()); - auto d_right = framework::EigenVector::Flatten(*d_right_t); - EigenRankLossGrad, T>::EvalRight( - dev, d_right, d_out, label, left, right); - } - } -}; -} // namespace operators -} // namespace paddle diff --git 
a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 4d3e79546fbef..35e76304b6c32 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -36,7 +36,7 @@ class BlockingQueue { : capacity_(capacity), speed_test_mode_(speed_test_mode) { PADDLE_ENFORCE_GT(capacity_, static_cast<size_t>(0), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The capacity of a reader::BlockingQueue must be " "greater than 0, but received capacity is %d.", capacity_)); @@ -59,7 +59,7 @@ class BlockingQueue { PADDLE_ENFORCE_LT( queue_.size(), capacity_, - platform::errors::PermissionDenied( + phi::errors::PermissionDenied( "The queue size cannot exceed the set queue capacity. Expected " "queue size is less than %d. But received %d", capacity_, @@ -86,7 +86,7 @@ class BlockingQueue { PADDLE_ENFORCE_LT( queue_.size(), capacity_, - platform::errors::PermissionDenied( + phi::errors::PermissionDenied( "The queue size cannot exceed the set queue capacity. Expected " "queue size is less than %d. But received %d", capacity_, @@ -104,7 +104,7 @@ class BlockingQueue { if (!queue_.empty()) { PADDLE_ENFORCE_NOT_NULL( elem, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The holder to receive queue data is null pointer.")); *elem = queue_.front(); if (LIKELY(!speed_test_mode_)) { @@ -115,7 +115,7 @@ class BlockingQueue { } else { PADDLE_ENFORCE_EQ(closed_, true, - platform::errors::PermissionDenied( + phi::errors::PermissionDenied( "Blocking queue status error, if queue is empty " "when pop data, it should be closed.")); VLOG(3) << "queue is closed! return nothing."; @@ -168,11 +168,10 @@ class BlockingQueue { private: inline void EnforceNotKilled() { - PADDLE_ENFORCE_NE( - killed_, - true, - platform::errors::Fatal("Blocking queue is killed because the " - "data reader raises an exception.")); + PADDLE_ENFORCE_NE(killed_, + true, + phi::errors::Fatal("Blocking queue is killed because the " + "data reader raises an exception.")); } private: diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index cc5034c86f90f..15bbc9ff10965 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -127,7 +127,7 @@ void BufferedReader::ReadAsync(size_t i) { PADDLE_ENFORCE_EQ( cuda.size(), cpu.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input tensor number on GPU and CPU devices are not matched.")); } if (pin_memory_) { @@ -250,7 +250,7 @@ void BufferedReader::ReadAsync(size_t i) { PADDLE_ENFORCE_EQ( xpu.size(), cpu.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input tensor number on XPU and CPU devices are not matched. " "The number on XPU is %d, on CPU is %d", xpu.size(), @@ -308,7 +308,7 @@ void BufferedReader::ReadAsync(size_t i) { } else { PADDLE_ENFORCE_EQ(custom_device.size(), cpu.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input tensor number on CustomDevice and CPU " "devices are not matched. 
" "The number on CustomDevice is %d, on CPU is %d", diff --git a/paddle/fluid/operators/reader/create_ctr_reader_op.cc b/paddle/fluid/operators/reader/create_ctr_reader_op.cc index 8c38aaf528da0..de0dff6be2533 100644 --- a/paddle/fluid/operators/reader/create_ctr_reader_op.cc +++ b/paddle/fluid/operators/reader/create_ctr_reader_op.cc @@ -35,7 +35,7 @@ class CreateCTRReaderOp : public framework::OperatorBase { auto* queue_holder_var = scope.FindVar(queue_name); PADDLE_ENFORCE_NOT_NULL( queue_holder_var, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "No LoDTensorBlockingQueueHolder variable with name %s found", queue_name)); auto* queue_holder = diff --git a/paddle/fluid/operators/reader/create_custom_reader_op.cc b/paddle/fluid/operators/reader/create_custom_reader_op.cc index 43fb5d9059c15..6a18e417a39bb 100644 --- a/paddle/fluid/operators/reader/create_custom_reader_op.cc +++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc @@ -100,12 +100,12 @@ class CustomReaderInferShape : public framework::InferShapeBase { PADDLE_ENFORCE_NE( ctx->IsRuntime(), true, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "'CustomReaderInferShape' should only be invoked during " "compile time.")); PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::NotFound( + phi::errors::NotFound( "The output decorated reader should not be null.")); const auto* sub_block = ctx->Attrs().Get("sub_block"); @@ -117,7 +117,7 @@ class CustomReaderInferShape : public framework::InferShapeBase { auto* sink_var = sub_block->FindVar(var_name); PADDLE_ENFORCE_NOT_NULL( sink_var, - platform::errors::NotFound( + phi::errors::NotFound( "The sink variable is not found in CustomReader.")); res_dims.emplace_back(sink_var->GetShape()); res_lod_levels.push_back(sink_var->GetLoDLevel()); @@ -135,7 +135,7 @@ class CustomReaderInferVarType : public framework::VarTypeInference { auto& out_var_name = ctx->Output("Out")[0]; PADDLE_ENFORCE_EQ(ctx->HasVar(out_var_name), true, - platform::errors::NotFound( + phi::errors::NotFound( "The output reader variable should not be null.")); ctx->SetType(out_var_name, framework::proto::VarType::READER); @@ -148,7 +148,7 @@ class CustomReaderInferVarType : public framework::VarTypeInference { framework::VarDesc* var = sub_block->FindVar(var_name); PADDLE_ENFORCE_NOT_NULL( var, - platform::errors::NotFound( + phi::errors::NotFound( "The sink variable is not found in CustomReader.")); res_data_types.emplace_back(var->GetDataType()); } @@ -167,7 +167,7 @@ void CustomReader::ReadNextImpl(paddle::framework::LoDTensorArray* out) { PADDLE_ENFORCE_EQ( source_var_names_.size(), underlying_outs.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of source_var_names(%d) and the size of " "underlying_outs(%d) are not consistent. 
Each feeding element " "must have its own source variable.", @@ -192,8 +192,8 @@ void CustomReader::ReadNextImpl(paddle::framework::LoDTensorArray* out) { auto* var = exe_scope->FindVar(sink_var_names_[i]); PADDLE_ENFORCE_NOT_NULL( var, - platform::errors::NotFound("The variable %s is not in current scope.", - sink_var_names_[i])); + phi::errors::NotFound("The variable %s is not in current scope.", + sink_var_names_[i])); const auto& tensor = var->Get(); framework::TensorCopySync(tensor, platform::CPUPlace(), &(*out)[i]); } diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 5cea8f5963111..975a32e9ab496 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -35,9 +35,9 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { dynamic_cast(out->Get().get()); PADDLE_ENFORCE_NOT_NULL( decorated_reader, - platform::errors::NotFound("The inited reader should be a " - "DecoratedReader when running " - "create_double_buffer_reader op.")); + phi::errors::NotFound("The inited reader should be a " + "DecoratedReader when running " + "create_double_buffer_reader op.")); if (decorated_reader->UnderlyingReader() == underlying_reader.Get()) { return; } diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc index e9edce4423e26..c55e77fc14787 100644 --- a/paddle/fluid/operators/reader/create_py_reader_op.cc +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -35,7 +35,7 @@ class CreatePyReaderOp : public framework::OperatorBase { auto* queue_holder_var = scope.FindVar(queue_name); PADDLE_ENFORCE_NOT_NULL( queue_holder_var, - platform::errors::NotFound( + phi::errors::NotFound( "No LoDTensorBlockingQueueHolder variable with name %s found. 
This " "may be because the DataLoader is defined in another Scope, " "which is different from the Scope when calling Executor.run.", diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h index da265a6fce76d..208377937c130 100644 --- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h +++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h @@ -92,15 +92,15 @@ class OrderedMultiDeviceLoDTensorBlockingQueue { std::lock_guard lock(init_mutex_); PADDLE_ENFORCE_GE(dev_cnt, 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Device count to init " "OrderedMultiDeviceLoDTensorBlockingQueue" " must be larger than 1")); if (!queues_.empty()) { - PADDLE_ENFORCE_EQ(queues_.size(), - dev_cnt, - platform::errors::InvalidArgument( - "queues should be only inited once")); + PADDLE_ENFORCE_EQ( + queues_.size(), + dev_cnt, + phi::errors::InvalidArgument("queues should be only inited once")); return; } @@ -119,7 +119,7 @@ class OrderedMultiDeviceLoDTensorBlockingQueue { PADDLE_ENFORCE_LT( idx, queues_.size(), - platform::errors::OutOfRange("The queue index is out of range")); + phi::errors::OutOfRange("The queue index is out of range")); return queues_[idx]; } @@ -184,7 +184,7 @@ class OrderedMultiDeviceLoDTensorBlockingQueue { void EnforceIsInited() const { PADDLE_ENFORCE_EQ(queues_.empty(), false, - platform::errors::NotFound("queue has not been inited")); + phi::errors::NotFound("queue has not been inited")); } private: @@ -209,8 +209,8 @@ class LoDTensorBlockingQueueHolder { PADDLE_ENFORCE_EQ( queue_, nullptr, - platform::errors::AlreadyExists("LoDTensorBlockingQueueHolder::" - "InitOnce() can only be called once")); + phi::errors::AlreadyExists("LoDTensorBlockingQueueHolder::" + "InitOnce() can only be called once")); queue_ = std::make_unique(capacity, speed_test_mode); } @@ -228,7 +228,7 @@ class OrderedMultiDeviceLoDTensorBlockingQueueHolder { void InitOnce(size_t capacity, bool speed_test_mode = false) { PADDLE_ENFORCE_EQ(queue_, nullptr, - platform::errors::AlreadyExists( + phi::errors::AlreadyExists( "OrderedMultiDeviceLoDTensorBlockingQueueHolder::" "InitOnce() can only be called once")); queue_ = std::make_unique( diff --git a/paddle/fluid/operators/reader/py_reader.cc b/paddle/fluid/operators/reader/py_reader.cc index f0c0409a729a5..d71f4b9e9ca95 100644 --- a/paddle/fluid/operators/reader/py_reader.cc +++ b/paddle/fluid/operators/reader/py_reader.cc @@ -25,7 +25,7 @@ PyReader::PyReader( const std::vector& need_check_feed) : framework::FileReader(dims, var_types, need_check_feed) { PADDLE_ENFORCE_NOT_NULL(queue, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "LoDTensorBlockingQueue must not be null.")); queue_ = queue; } diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc index 1c65669adc3a9..d88dfb4962a9c 100644 --- a/paddle/fluid/operators/reader/read_op.cc +++ b/paddle/fluid/operators/reader/read_op.cc @@ -51,7 +51,7 @@ class ReadInferShape : public framework::InferShapeBase { PADDLE_ENFORCE_EQ( reader_dims.size(), out_names.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The reader's dim number doesn't match the output number.")); ctx->SetOutputsDim("Out", reader_dims); auto in_desc = @@ -61,7 +61,7 @@ class ReadInferShape : public framework::InferShapeBase { PADDLE_ENFORCE_EQ( in_lod_levels.size(), out_var_ptrs.size(), - platform::errors::InvalidArgument( + 
phi::errors::InvalidArgument( "LoDLevels of Input(Reader) must be the same as the " "number of Outputs(Out).")); for (size_t i = 0; i < out_var_ptrs.size(); ++i) { @@ -82,7 +82,7 @@ class ReadInferVarType : public framework::StaticGraphVarTypeInference { auto dtypes = GetDataTypes(ctx, reader_name); PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of input reader's dtypes do not match " "the output variable number.")); for (size_t i = 0; i < dtypes.size(); ++i) { @@ -120,8 +120,8 @@ class ReadOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ( ins.size(), out_arg_names.size(), - platform::errors::InvalidArgument("input data number and output data " - "number of read_op do not match")); + phi::errors::InvalidArgument("input data number and output data " + "number of read_op do not match")); const std::vector& shapes = reader->Shapes(); const std::vector& var_types = @@ -130,7 +130,7 @@ class ReadOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ( out_arg_names.size(), need_check_feed.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output size of read_op and the number of fed " "variables of reader do not match. Received size of output is %d, " "number of fed variables of reader is %d", @@ -145,7 +145,7 @@ class ReadOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ( DimensionIsCompatibleWith(shapes[i], in_dims), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The fed Variable %s should have dimensions = %d, " "shape = [%s], but received fed shape [%s]", out_arg_names[i], @@ -155,7 +155,7 @@ class ReadOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ( framework::TransToProtoVarType(ins[i].dtype()), var_types[i], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The data type of fed Variable %s must be %s, but received %s", out_arg_names[i], var_types[i], diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc index e62d728b6f017..9a1693c5061c7 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.cc +++ b/paddle/fluid/operators/reader/reader_op_registry.cc @@ -69,13 +69,13 @@ void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_NE( ctx->IsRuntime(), true, - platform::errors::PreconditionNotMet("'FileReaderInferShape' should only " - "be invoked during compile time.")); + phi::errors::PreconditionNotMet("'FileReaderInferShape' should only " + "be invoked during compile time.")); PADDLE_ENFORCE_EQ( ctx->HasOutput("Out"), true, - platform::errors::NotFound("The output file reader should not be null.")); + phi::errors::NotFound("The output file reader should not be null.")); bool use_data_config = ctx->Attrs().Get("use_data_config"); if (use_data_config) { const auto shape_concat = @@ -88,7 +88,7 @@ void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ( lod_levels.size(), shapes.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of 'lod_levels'(%d) doesn't match the number " "of 'shapes'(%d).", lod_levels.size(), @@ -97,16 +97,16 @@ void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ( dtypes.size(), shapes.size(), - platform::errors::InvalidArgument("The number of 'dtypes'(%d) doesn't " - "match the number of 'shapes'(%d).", - dtypes.size(), - shapes.size())); + 
phi::errors::InvalidArgument("The number of 'dtypes'(%d) doesn't " + "match the number of 'shapes'(%d).", + dtypes.size(), + shapes.size())); const auto need_check_feed = ctx->Attrs().Get>("need_check_feed"); PADDLE_ENFORCE_EQ( need_check_feed.size(), shapes.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of 'need_check_feed'(%d) doesn't match the " "number of 'shapes'(%d).", need_check_feed.size(), @@ -127,18 +127,18 @@ void DecoratedReaderInferShape::operator()( PADDLE_ENFORCE_NE( ctx->IsRuntime(), true, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "'DecoratedReaderInferShape' should only be invoked during " "compile time.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("UnderlyingReader"), - true, - platform::errors::NotFound( - "Input(UnderlyingReader) should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), - true, - platform::errors::NotFound( - "The output decorated reader should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("UnderlyingReader"), + true, + phi::errors::NotFound("Input(UnderlyingReader) should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput("Out"), + true, + phi::errors::NotFound("The output decorated reader should not be null.")); ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader")); framework::VarDesc* in_reader = PADDLE_GET( diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index a5d4ce5e29828..42856d5b3c12a 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -77,10 +77,10 @@ StepScopes::StepScopes(const platform::DeviceContext &dev_ctx, is_train_(is_train), is_backward_(is_backward) { size_t num_step_scopes = is_train ? seq_len : 2; - PADDLE_ENFORCE_EQ(is_train || !is_backward, - true, - platform::errors::PreconditionNotMet( - "Cannot backward when is not training")); + PADDLE_ENFORCE_EQ( + is_train || !is_backward, + true, + phi::errors::PreconditionNotMet("Cannot backward when is not training")); if (!is_backward_) { ClearStepScopes(dev_ctx, const_cast(&parent), scopes); scopes->reserve(static_cast(num_step_scopes)); @@ -101,7 +101,7 @@ void StepScopes::BackwardNext(const platform::DeviceContext &dev_ctx, framework::Scope *parent_scope) { PADDLE_ENFORCE_EQ(is_backward_, true, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Cannot get backward next scope when is forward")); if (counter_ + 2 == scopes_->size()) { parent_scope->DeleteScope((*scopes_)[counter_ + 1]); @@ -114,7 +114,7 @@ void StepScopes::BackwardNext(const platform::DeviceContext &dev_ctx, void StepScopes::ForwardNext() { PADDLE_ENFORCE_EQ(is_backward_, false, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Cannot get forward next scope when is backward")); ++counter_; } @@ -126,7 +126,7 @@ framework::Scope &StepScopes::GetScope(size_t scope_id) const { PADDLE_ENFORCE_LT( scope_id, scopes_->size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input scope_id is greater than scopes size in RecurrentOp")); return *(*scopes_)[scope_id]; } @@ -149,16 +149,16 @@ int64_t RecurrentBase::GetSequenceLength(const framework::Scope &scope) const { PADDLE_ENFORCE_EQ( all_inputs.empty(), false, - platform::errors::InvalidArgument("RecurrentOp gets empty input")); + phi::errors::InvalidArgument("RecurrentOp gets empty input")); for (auto &iname : all_inputs) { auto *var = scope.FindVar(iname); PADDLE_ENFORCE_NOT_NULL(var, - 
platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "RecurrentOp finds var %s is NULL", iname)); PADDLE_ENFORCE_EQ( var->IsType(), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "RecurrentOp only accepts phi::DenseTensor as input but " "input var %s is not phi::DenseTensor", iname)); @@ -168,7 +168,7 @@ int64_t RecurrentBase::GetSequenceLength(const framework::Scope &scope) const { } else { PADDLE_ENFORCE_EQ(seq_len, dim[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Sequence length of input %s in RecurrentOp is NOT " "equal to sequence length of previous input", iname)); @@ -176,7 +176,7 @@ int64_t RecurrentBase::GetSequenceLength(const framework::Scope &scope) const { } PADDLE_ENFORCE_GE(seq_len, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "RecurrentOp gets invalid sequence length. Expected " "seq_len >= 0. Received seq_len = %d", seq_len)); @@ -331,9 +331,9 @@ StepScopes RecurrentOp::CreateStepScopes(const platform::DeviceContext &dev_ctx, // fault in multithreading in eval process. The performance drop of // adding mutex need to be fixed. auto *var = scope.FindVar(Output(kStepScopes)); - PADDLE_ENFORCE_NOT_NULL(var, - platform::errors::InvalidArgument( - "RecurrentOp gets empty StepScopes var")); + PADDLE_ENFORCE_NOT_NULL( + var, + phi::errors::InvalidArgument("RecurrentOp gets empty StepScopes var")); return StepScopes(dev_ctx, scope, var->GetMutable(), @@ -413,7 +413,7 @@ void RecurrentGradOp::RunImpl(const framework::Scope &scope, PADDLE_ENFORCE_EQ(ex_state_grads.size(), cur_state_grads.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "lengths of ex_states and cur_states are not " "equal in RecurrentGradOp")); for (size_t i = 0; i < ex_state_grads.size(); ++i) { @@ -475,7 +475,7 @@ void RecurrentGradOp::RunImpl(const framework::Scope &scope, auto &p_names = Inputs(kParameters); PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Sizes of Parameters and ParamGrads are not equal " "in RecurrentGradOp")); @@ -566,7 +566,7 @@ void RecurrentGradOp::RunImpl(const framework::Scope &scope, // Delete the scope of StepScopes auto *var = scope.FindVar(Input(kStepScopes)); PADDLE_ENFORCE_NOT_NULL(var, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "StepScopes var is empty in RecurrentGradOp")); auto *step_scopes = var->GetMutable(); ClearStepScopes(dev_ctx, const_cast(&scope), step_scopes); @@ -578,7 +578,7 @@ StepScopes RecurrentGradOp::CreateStepScopes( size_t seq_len) const { auto *var = scope.FindVar(Input(kStepScopes)); PADDLE_ENFORCE_NOT_NULL(var, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "StepScopes var is empty in RecurrentGradOp")); return StepScopes(dev_ctx, scope, @@ -735,27 +735,27 @@ class RecurrentGradOpShapeInference : public framework::InferShapeBase { .Get>(RecurrentBase::kExStates) .size(), 0, - platform::errors::InvalidArgument("The Attr(%s) should be empty.", - RecurrentBase::kExStates)); + phi::errors::InvalidArgument("The Attr(%s) should be empty.", + RecurrentBase::kExStates)); PADDLE_ENFORCE_EQ( ctx->Attrs() .Get>(RecurrentBase::kStates) .size(), 0, - platform::errors::InvalidArgument("The Attr(%s) should be empty.", - RecurrentBase::kStates)); + phi::errors::InvalidArgument("The Attr(%s) should be empty.", + RecurrentBase::kStates)); } PADDLE_ENFORCE_EQ( ctx->HasInputs(RecurrentBase::kInputs), true, - 
platform::errors::InvalidArgument("The input(%s) should not be empty.", - RecurrentBase::kInputs)); + phi::errors::InvalidArgument("The input(%s) should not be empty.", + RecurrentBase::kInputs)); PADDLE_ENFORCE_EQ( ctx->HasInputs(RecurrentBase::kOutputs), true, - platform::errors::InvalidArgument("The input(%s) should not be empty.", - RecurrentBase::kOutputs)); + phi::errors::InvalidArgument("The input(%s) should not be empty.", + RecurrentBase::kOutputs)); // In some case the kInitialStates is empty. if (ctx->HasInputs(RecurrentBase::kInitialStates) && @@ -769,7 +769,7 @@ class RecurrentGradOpShapeInference : public framework::InferShapeBase { ctx->HasOutputs(framework::GradVarName(RecurrentBase::kInputs), /*allow_null=*/true), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The output of(%s) should not be empty.", framework::GradVarName(RecurrentBase::kInputs))); ctx->SetOutputsDim(framework::GradVarName(RecurrentBase::kInputs), @@ -780,7 +780,7 @@ class RecurrentGradOpShapeInference : public framework::InferShapeBase { PADDLE_ENFORCE_EQ( ctx->HasOutputs(framework::GradVarName(RecurrentBase::kParameters)), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The output of(%s) should not be empty.", framework::GradVarName(RecurrentBase::kParameters))); ctx->SetOutputsDim(framework::GradVarName(RecurrentBase::kParameters), diff --git a/paddle/fluid/operators/recurrent_op.h b/paddle/fluid/operators/recurrent_op.h index d027205429513..b1be9a5c0389e 100644 --- a/paddle/fluid/operators/recurrent_op.h +++ b/paddle/fluid/operators/recurrent_op.h @@ -122,7 +122,7 @@ class RecurrentBase : public framework::OperatorBase { bool is_backward = false) { PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Sizes of source vars and destination vars are not " "equal in LinkTensor.")); for (size_t i = 0; i < dst_vars.size(); ++i) { @@ -148,7 +148,7 @@ class RecurrentBase : public framework::OperatorBase { bool is_backward = false) { PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Sizes of source vars and destination vars are not " "equal in LinkTensor.")); for (size_t i = 0; i < dst_vars.size(); ++i) { @@ -180,8 +180,8 @@ class RecurrentBase : public framework::OperatorBase { } PADDLE_ENFORCE_NOT_NULL( src_var, - platform::errors::NotFound("Source variable %s is not found.", - src_var_name)); + phi::errors::NotFound("Source variable %s is not found.", + src_var_name)); auto &src_tensor = src_var->Get(); auto *dst_var = dst_scope->Var(dst_var_name); @@ -203,13 +203,13 @@ class RecurrentBase : public framework::OperatorBase { auto *src_var = src_scope.FindVar(src_var_name); PADDLE_ENFORCE_NOT_NULL( src_var, - platform::errors::NotFound("Source variable %s is not found.", - src_var_name)); + phi::errors::NotFound("Source variable %s is not found.", + src_var_name)); auto &src_tensor = src_var->Get(); PADDLE_ENFORCE_NOT_NULL( dst_var, - platform::errors::NotFound("Destination variable %s is not found.", - src_var_name)); + phi::errors::NotFound("Destination variable %s is not found.", + src_var_name)); auto *dst_tensor = dst_var->GetMutable(); callback(src_tensor, dst_tensor); } diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc index b23fee1a012df..a7776609b79b8 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc +++ 
b/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc @@ -57,12 +57,12 @@ class XPULogsumexpKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); int r = xpu::logsumexp( dev_ctx.x_context(), input_data, output_data, xdims, axis_shape); - PADDLE_ENFORCE_EQ(r, - xpu::Error_t::SUCCESS, - platform::errors::External( - "XPU logsumexp kernel error! error value[%d %]", - r, - XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_EQ( + r, + xpu::Error_t::SUCCESS, + phi::errors::External("XPU logsumexp kernel error! error value[%d %]", + r, + XPUAPIErrorMsg[r])); } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.h b/paddle/fluid/operators/reduce_ops/reduce_mean_op.h index 017fab6308821..eb82be83ba517 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.h @@ -62,7 +62,7 @@ struct FP16MeanGradFunctor { int size) { dx->device(place) = (dy->template cast().broadcast(dim) / dx->template cast().constant(size)) - .template cast(); + .template cast(); } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 428c8d2c9a02c..2e14acddc1485 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -87,7 +87,7 @@ static inline std::vector GetReduceDim(const std::vector& dims, for (auto e : dims) { PADDLE_ENFORCE_LT(e, dim_size, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ReduceBaseOp: invalid axis, when x_dims is %d, " "axis[i] should less than x_dims, but got %d.", dim_size, @@ -511,7 +511,7 @@ class ReduceBaseOp : public framework::OperatorWithKernel { auto dims = ctx->Attrs().Get>("dim"); PADDLE_ENFORCE_GT(dims.size(), 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input dim dimensions of ReduceBaseOp " "should be greater than 0. But received the dim " "dimensions of Reduce = %d.", @@ -521,7 +521,7 @@ class ReduceBaseOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_LT( dims[i], x_rank, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The reduce dim index %d should be in the " "range [-dimension(X), dimension(X)] " "which dimension = %d. But received dim index = %d.", @@ -531,7 +531,7 @@ class ReduceBaseOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE( dims[i], -x_rank, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The reduce dim index %d should be in the " "range [-dimension(X), dimension(X)] " "which dimension = %d. But received dim index = %d.", @@ -628,7 +628,7 @@ class ReduceBaseOp : public framework::OperatorWithKernel { platform::is_xpu_place(ctx.GetPlace()) || platform::is_custom_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "float16 can only be used on GPU or XPU place")); } return phi::KernelKey(input_data_type, ctx.GetPlace()); @@ -670,7 +670,7 @@ class ReduceGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_LT( dims[i], x_rank, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The reduce dim index %d should be in the " "range [-dimension(X), dimension(X)], " "which dimension = %d. 
But received dim index = %d.", diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h b/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h index 35cc8fea6d0ba..31279af17e176 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h @@ -34,10 +34,9 @@ void XPUReduce(const framework::ExecutionContext& context, T*, const std::vector&, const std::vector&)> func) { - PADDLE_ENFORCE_EQ( - platform::is_xpu_place(context.GetPlace()), - true, - platform::errors::Unavailable("This kernel only runs on XPU.")); + PADDLE_ENFORCE_EQ(platform::is_xpu_place(context.GetPlace()), + true, + phi::errors::Unavailable("This kernel only runs on XPU.")); bool reduce_all = context.Attr("reduce_all"); auto dims = context.Attr>("dim"); auto* x = context.Input("X"); @@ -48,7 +47,7 @@ void XPUReduce(const framework::ExecutionContext& context, int out_dtype = context.Attr("out_dtype"); PADDLE_ENFORCE_EQ(out_dtype == -1, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "XPU only support out_dtype == -1 in reduce op.")); const auto* x_data = x->data(); @@ -88,16 +87,16 @@ void XPUReduce(const framework::ExecutionContext& context, dev_ctx.x_context(), x_data, y_data, x->numel() * sizeof(T)); PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true, - platform::errors::External("XPU copy in reduce op return " - "wrong value[%d %s].", - r, - XPUAPIErrorMsg[r])); + phi::errors::External("XPU copy in reduce op return " + "wrong value[%d %s].", + r, + XPUAPIErrorMsg[r])); } else { int r = func(dev_ctx.x_context(), x_data, y_data, xdims, reduce_dims); PADDLE_ENFORCE_EQ( r == xpu::Error_t::SUCCESS, true, - platform::errors::External( + phi::errors::External( "XPU reduce op return wrong value[%d %s].", r, XPUAPIErrorMsg[r])); } } diff --git a/paddle/fluid/operators/reduce_ops/unity_build_rule.cmake b/paddle/fluid/operators/reduce_ops/unity_build_rule.cmake index 839bb1ac7306c..da67c2c8d8b01 100644 --- a/paddle/fluid/operators/reduce_ops/unity_build_rule.cmake +++ b/paddle/fluid/operators/reduce_ops/unity_build_rule.cmake @@ -4,8 +4,7 @@ # Generally, the combination rules in this file do not need to be modified. # If there are some redefined error in compiling with the source file which # in combination rule, you can remove the source file from the following rules. -register_unity_group(cc reduce_all_op.cc reduce_any_op.cc) -register_unity_group(cu reduce_all_op.cu reduce_any_op.cu) + # The following groups are to make better use of `/MP` which MSVC's parallel # compilation instruction when compiling in Unity Build. 
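The ReduceBaseOp checks above pin every entry of the dim attribute to -rank <= dim < rank before the kernel runs. Below is a standalone sketch of that axis normalization in plain C++, with no Paddle types; the function name is hypothetical.

#include <stdexcept>
#include <vector>

std::vector<int> NormalizeReduceDims(const std::vector<int>& dims, int rank) {
  std::vector<int> out;
  out.reserve(dims.size());
  for (int d : dims) {
    if (d < -rank || d >= rank) {  // mirrors the PADDLE_ENFORCE_GE/LT pair above
      throw std::invalid_argument("reduce dim index out of range");
    }
    out.push_back(d < 0 ? d + rank : d);  // wrap a negative axis once
  }
  return out;
}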
register_unity_group(cu frobenius_norm_op.cu) diff --git a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc index 319fad9b39231..5ce59fc54d6a6 100644 --- a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc +++ b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc @@ -210,10 +210,10 @@ class ReorderLoDTensorByRankTableOp : public ReorderLoDTensorByRankTableBase { size_t out_offset = 0; out->mutable_lod()->clear(); for (auto &item : rank_table.items()) { - PADDLE_ENFORCE_LT(item.index, - absolute_table.size(), - platform::errors::OutOfRange( - "The value of rank_table is out of range.")); + PADDLE_ENFORCE_LT( + item.index, + absolute_table.size(), + phi::errors::OutOfRange("The value of rank_table is out of range.")); out_offset = CopyTensorAndLod( place, absolute_table[item.index], x, out, out_offset); } diff --git a/paddle/fluid/operators/repeat_interleave_op.cc b/paddle/fluid/operators/repeat_interleave_op.cc index d0af82510bdc4..e276ef2082fb6 100644 --- a/paddle/fluid/operators/repeat_interleave_op.cc +++ b/paddle/fluid/operators/repeat_interleave_op.cc @@ -29,12 +29,12 @@ class RepeatInterleaveOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->HasInput("X"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(X) of RepeatInterleaveOp should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(Out) of RepeatInterleaveOp should not be null.")); auto input_dim = ctx->GetInputDim("X"); @@ -43,7 +43,7 @@ class RepeatInterleaveOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( dim < input_dim.size() && dim >= (0 - input_dim.size()), true, - platform::errors::OutOfRange( + phi::errors::OutOfRange( "Attr(dim) is out of range, It's expected " "to be in range of [-%d, %d]. But received Attr(dim) = %d.", input_dim.size(), @@ -58,7 +58,7 @@ class RepeatInterleaveOp : public framework::OperatorWithKernel { repeats_dim.size() == 1 || (repeats_dim.size() == 2 && repeats_dim[1] == 1), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The 'shape' of Input(RepeatsTensor) must be 1-D tensor. 
" "But received: the 'shape' of Input(Index) is [%s], " "the dimension of Input(Index) is [%d].", @@ -67,7 +67,7 @@ class RepeatInterleaveOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(repeats_dim[0] != 0, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The length of Input(RepeatsTensor) can't be 0.")); if (dim < 0) { @@ -98,14 +98,14 @@ class RepeatInterleaveGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), - true, - platform::errors::InvalidArgument( - "Input(Out@GRAD) should be not null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), - true, - platform::errors::InvalidArgument( - "Output(X@GRAD) should be not null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput(framework::GradVarName("Out")), + true, + phi::errors::InvalidArgument("Input(Out@GRAD) should be not null.")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput(framework::GradVarName("X")), + true, + phi::errors::InvalidArgument("Output(X@GRAD) should be not null.")); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 34d80604ae8b0..d984edc4c4172 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -52,11 +52,11 @@ class ReshapeOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(X) of ReshapeOp should not be null.")); PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(Out) of ReshapeOp should not be null.")); if (ctx->IsRuntime()) { @@ -76,7 +76,7 @@ class ReshapeOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT( ShapeTensor.size(), 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "When `shape` in ReshapeOp is a list or tuple " "which contains Tensor, the shape's size can't be zero. " "But received shape's size is %d.", @@ -89,7 +89,7 @@ class ReshapeOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_LT( static_cast(i), in_dims.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The index of 0 in `shape` must be less than " "the input tensor X's dimensions. But received shape[%d] " "= 0, X's dimensions = %d, X's shape = [%s].", @@ -155,7 +155,7 @@ class ReshapeOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( unk_dim_idx, -1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only one dimension value of 'shape' in ReshapeOp can " "be -1. But received shape = [%s], shape[%d] is also -1.", common::make_ddim(shape), @@ -165,7 +165,7 @@ class ReshapeOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_LT( static_cast(i), in_dims.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The index of 0 in `shape` must be less than " "the input tensor X's dimensions. 
" "But received shape = [%s], shape[%d] = 0, X's shape = [%s], " @@ -178,7 +178,7 @@ class ReshapeOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT( shape[i], 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Each dimension value of 'shape' in ReshapeOp must not " "be negative except one unknown dimension. " "But received shape = [%s], shape[%d] = %d.", @@ -204,7 +204,7 @@ class ReshapeOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( output_shape[unk_dim_idx] * capacity, -in_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The 'shape' attribute in ReshapeOp is invalid. " "The input tensor X'size must be divisible by known " "capacity of 'shape'. " @@ -222,7 +222,7 @@ class ReshapeOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( capacity, in_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The 'shape' in ReshapeOp is invalid. " "The input tensor X'size must be equal to the capacity of " "'shape'. " @@ -242,7 +242,7 @@ class ReshapeOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_LE( capacity, in_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The 'shape' in ReshapeOp is invalid. " "The input tensor X's shape = [%s], X's capacity = %d." "But the target shape of Out is [%s], the " @@ -359,11 +359,11 @@ class ReshapeGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->HasInput("X"), true, - platform::errors::InvalidArgument("Input(X) shouldn't be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), - true, - platform::errors::InvalidArgument( - "Input(Out@GRAD) shouldn't be null.")); + phi::errors::InvalidArgument("Input(X) shouldn't be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput(framework::GradVarName("Out")), + true, + phi::errors::InvalidArgument("Input(Out@GRAD) shouldn't be null.")); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } @@ -613,11 +613,11 @@ class Reshape2GradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->HasInput("XShape"), true, - platform::errors::InvalidArgument("Input(XShape) shouldn't be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), - true, - platform::errors::InvalidArgument( - "Input(Out@GRAD) shouldn't be null.")); + phi::errors::InvalidArgument("Input(XShape) shouldn't be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput(framework::GradVarName("Out")), + true, + phi::errors::InvalidArgument("Input(Out@GRAD) shouldn't be null.")); // Construct MetaTensor for InferMeta Func using CompatMetaTensor = framework::CompatMetaTensor; @@ -774,7 +774,7 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, ops::ReshapeKernel, int64_t, ops::ReshapeKernel, - plat::float16, + phi::dtype::float16, ops::ReshapeKernel, plat::bfloat16, ops::ReshapeKernel); @@ -791,7 +791,7 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, ops::ReshapeGradKernel, uint8_t, ops::ReshapeGradKernel, - plat::float16, + phi::dtype::float16, ops::ReshapeGradKernel, plat::bfloat16, ops::ReshapeGradKernel); diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc index 04633c9e8e5dd..38d77de90ace4 100644 --- a/paddle/fluid/operators/row_conv_op.cc +++ b/paddle/fluid/operators/row_conv_op.cc @@ -39,7 +39,7 @@ class RowConvOp : public framework::OperatorWithKernel { auto filter_dims = ctx->GetInputDim("Filter"); PADDLE_ENFORCE_EQ(filter_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( 
"Input(Filter)'s dimensions should be 2. Received: " "Input(Filter)'s shape: [%s].", filter_dims)); diff --git a/paddle/fluid/operators/rrelu_op.cc b/paddle/fluid/operators/rrelu_op.cc index 53f6969695e8e..3111ad4e5015d 100644 --- a/paddle/fluid/operators/rrelu_op.cc +++ b/paddle/fluid/operators/rrelu_op.cc @@ -52,7 +52,7 @@ class RReluOpMaker : public framework::OpProtoAndCheckerMaker { .AddCustomChecker([](const float& lower) { PADDLE_ENFORCE_EQ(lower >= 0.0f && lower < 1.0f, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'RRelu_lower' must be between 0.0 and 1.0.")); }); float defalut_upper = 1. / 3.; @@ -61,7 +61,7 @@ class RReluOpMaker : public framework::OpProtoAndCheckerMaker { .AddCustomChecker([](const float& upper) { PADDLE_ENFORCE_EQ(upper > 0.0f && upper <= 1.0f, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "'RRelu_upper' must be between 0.0 and 1.0.")); }); AddComment(R"DOC( diff --git a/paddle/fluid/operators/run_program_op.cc b/paddle/fluid/operators/run_program_op.cc index ffb024d165d36..0dc2d8ea0e20d 100644 --- a/paddle/fluid/operators/run_program_op.cc +++ b/paddle/fluid/operators/run_program_op.cc @@ -24,13 +24,13 @@ class RunProgramOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInputs("X"), - true, - platform::errors::NotFound( - "Input(X) of RunProgramOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInputs("X"), + true, + phi::errors::NotFound("Input(X) of RunProgramOp should not be null.")); PADDLE_ENFORCE_EQ(ctx->HasOutputs("Out"), true, - platform::errors::NotFound( + phi::errors::NotFound( "Output(Out) of RunProgramOp should not be null.")); } @@ -173,12 +173,12 @@ class RunProgramGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE_EQ(ctx->HasInputs("X"), true, - platform::errors::NotFound( + phi::errors::NotFound( "Input(X) of RunProgramGradOp should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasInputs(framework::GradVarName("Out")), true, - platform::errors::NotFound( + phi::errors::NotFound( "Input(Out@GRAD) of RunProgramGradOp should not be null.")); // NOTE: The X@GRAD and Params@GRAD may not exist, // because they can be set stop_gradient = True diff --git a/paddle/fluid/operators/run_program_op.cu b/paddle/fluid/operators/run_program_op.cu index 9a2b6851a4c73..1d9011429577b 100644 --- a/paddle/fluid/operators/run_program_op.cu +++ b/paddle/fluid/operators/run_program_op.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/run_program_op.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/phi/common/float16.h" namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index 6006d7556423c..895a99608c902 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -32,7 +32,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/framework/variable.h" #ifdef PADDLE_WITH_DNNL -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/onednn_helper.h" #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/operators/cuda_graph_with_in_out.h" @@ -58,7 +58,7 @@ static void CheckInputVarStatus(const Variable &var, const std::string &var_name) { PADDLE_ENFORCE_EQ(var.IsType(), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input variable %s of " "RunProgram(Grad)Op holds " "wrong type. Expect type is phi::DenseTensor, but " @@ -68,10 +68,10 @@ static void CheckInputVarStatus(const Variable &var, PADDLE_ENFORCE_EQ( var.Get().IsInitialized(), true, - platform::errors::InvalidArgument("The tensor in input variable %s of " - "RunProgram(Grad)Op " - "is not initialized.", - var_name)); + phi::errors::InvalidArgument("The tensor in input variable %s of " + "RunProgram(Grad)Op " + "is not initialized.", + var_name)); } static void CheckOutputVarStatus(const Variable &src_var, @@ -81,7 +81,7 @@ static void CheckOutputVarStatus(const Variable &src_var, PADDLE_ENFORCE_EQ( src_var.IsType(), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The output variable %s get from " "RunProgram(Grad)Op's internal scope holds " "wrong type. Expect type is phi::DenseTensor, but receive type is " @@ -90,7 +90,7 @@ static void CheckOutputVarStatus(const Variable &src_var, platform::demangle(framework::ToTypeName(src_var.Type())))); PADDLE_ENFORCE_EQ(src_var.Get().IsInitialized(), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The tensor in output variable %s get from " "RunProgram(Grad)Op's internal " "scope is not initialized.", @@ -99,7 +99,7 @@ static void CheckOutputVarStatus(const Variable &src_var, PADDLE_ENFORCE_EQ( src_var.IsType(), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The output variable %s get from " "RunProgram(Grad)Op's internal scope holds " "wrong type. 
Expect type is SelectedRows, but receive type is %s.", @@ -107,14 +107,14 @@ static void CheckOutputVarStatus(const Variable &src_var, platform::demangle(framework::ToTypeName(src_var.Type())))); PADDLE_ENFORCE_EQ(src_var.Get().value().IsInitialized(), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The tensor in output variable %s get from " "RunProgram(Grad)Op's " "internal scope is not initialized.", var_name)); } else { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "The RunProgram(Grad)Op only support output " "variable of type phi::DenseTensor or SelectedRows, " "but received variable %s's type is %s", @@ -173,10 +173,10 @@ static void ShareVarsFromScope(const std::vector &vars, auto *var = scope->FindVar(var_names[i]); PADDLE_ENFORCE_NOT_NULL( var, - platform::errors::NotFound("The output variable %s is not in " - "RunProgram(Grad)Op'" - "s internal scope.", - var_names[i])); + phi::errors::NotFound("The output variable %s is not in " + "RunProgram(Grad)Op'" + "s internal scope.", + var_names[i])); CheckOutputVarStatus(*var, *vars[i], var_names[i]); VariableShare(*var, vars[i]); } @@ -312,14 +312,14 @@ class RunProgramOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( use_cuda_graph, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "If not provide OutScope then must run under cuda graph mode.")); inner_scope = std::make_unique(); } else { PADDLE_ENFORCE_EQ( out_scope_vec->size(), 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The OutScope of RunProgramGradOp should only hold one scope.")); } @@ -511,7 +511,7 @@ class RunProgramGradOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( out_scope_vec->size(), 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The OutScope of RunProgramGradOp should only hold one scope.")); framework::Scope *global_inner_scope = out_scope_vec->front(); @@ -519,7 +519,7 @@ class RunProgramGradOpKernel : public framework::OpKernel { VLOG(2) << "The number of sub scopes before backward: " << sub_scope_num; PADDLE_ENFORCE_GT(sub_scope_num, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The OutScope of RunProgramGradOp should hold at " "least one sub scope.")); diff --git a/paddle/fluid/operators/sampling_id_op.cc b/paddle/fluid/operators/sampling_id_op.cc deleted file mode 100644 index 5df5270976ca4..0000000000000 --- a/paddle/fluid/operators/sampling_id_op.cc +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
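The CheckInputVarStatus/CheckOutputVarStatus hunks above follow one pattern: assert that the Variable holds the expected type, then assert that the held tensor is initialized. A condensed sketch using only the interfaces visible in the hunks (IsType<T>(), Get<T>(), IsInitialized()); the helper name is hypothetical and the enforce header path is an assumption.

#include <string>

#include "paddle/fluid/framework/variable.h"
#include "paddle/phi/core/enforce.h"  // assumed location of PADDLE_ENFORCE_* and phi::errors

static void EnsureDenseTensorVar(const paddle::framework::Variable& var,
                                 const std::string& var_name) {
  // Type check first, so the Get<T>() below is safe.
  PADDLE_ENFORCE_EQ(var.IsType<phi::DenseTensor>(),
                    true,
                    phi::errors::InvalidArgument(
                        "Variable %s must hold a phi::DenseTensor.", var_name));
  // Then make sure the tensor actually carries data.
  PADDLE_ENFORCE_EQ(var.Get<phi::DenseTensor>().IsInitialized(),
                    true,
                    phi::errors::InvalidArgument(
                        "The tensor in variable %s is not initialized.",
                        var_name));
}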
*/ - -#include "paddle/fluid/operators/sampling_id_op.h" - -namespace paddle { -namespace operators { - -class SamplingIdOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SampleIn"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "X", "SampleOut"); - PADDLE_ENFORCE_LT( - ctx->Attrs().Get("min"), - ctx->Attrs().Get("max"), - platform::errors::InvalidArgument( - "min must less then max, but here min is %f, max is %f", - ctx->Attrs().Get("min"), - ctx->Attrs().Get("max"))); - - auto input_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - input_dims.size(), - 2, - platform::errors::InvalidArgument( - "Input(X, Filter) should be 2-D tensor. But X dim is %d", - input_dims.size())); - - auto dim0 = input_dims[0]; - framework::DDim dims = common::make_ddim({dim0}); - ctx->SetOutputDim("Out", dims); - ctx->ShareLoD("X", "Out"); - } -}; - -class SamplingIdOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "The input tensor of softmax. " - "2-D with shape [batch_size, input_feature_dimensions]."); - AddOutput("Out", "SamplingId data tensor."); - AddComment(R"DOC( -SamplingId Operator. -A layer for sampling id from multinomial distribution from the - input. Sampling one id for one sample.)DOC"); - AddAttr("min", "Minimum value of random. (float, default 0.0).") - .SetDefault(0.0f); - AddAttr("max", "Maximun value of random. (float, default 1.0).") - .SetDefault(1.0f); - AddAttr( - "seed", - "Random seed used for the random number engine. " - "0 means use a seed generated by the system." - "Note that if seed is not 0, this operator will " - "generate the same random numbers every time. (int, default 0).") - .SetDefault(0); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - sampling_id, - ops::SamplingIdOp, - ops::SamplingIdOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -PD_REGISTER_STRUCT_KERNEL(sampling_id, - CPU, - ALL_LAYOUT, - paddle::operators::SamplingIdKernel, - float, - double) {} diff --git a/paddle/fluid/operators/sampling_id_op.cu b/paddle/fluid/operators/sampling_id_op.cu deleted file mode 100644 index 2ec00d125bcab..0000000000000 --- a/paddle/fluid/operators/sampling_id_op.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#include "paddle/fluid/operators/sampling_id_op.h" - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL(sampling_id, - GPU, - ALL_LAYOUT, - paddle::operators::SamplingIdKernel, - float, - double) {} diff --git a/paddle/fluid/operators/sampling_id_op.h b/paddle/fluid/operators/sampling_id_op.h deleted file mode 100644 index 730d84c2a651e..0000000000000 --- a/paddle/fluid/operators/sampling_id_op.h +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/generator.h" - -namespace paddle { -namespace operators { - -template -class SamplingIdKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const phi::DenseTensor* input = context.Input("X"); - const int batch_size = static_cast(input->dims()[0]); - const int width = static_cast(input->dims()[1]); - - PADDLE_ENFORCE_GE( - batch_size, - 0, - platform::errors::InvalidArgument( - "batch_size(dims[0]) must be nonnegative. but it is %d.", - batch_size)); - PADDLE_ENFORCE_GE( - width, - 0, - platform::errors::InvalidArgument( - "width(dims[1]) must be nonnegative. but it is %d.", width)); - - std::vector ins_vector; - framework::TensorToVector(*input, context.device_context(), &ins_vector); - - unsigned int seed = static_cast(context.Attr("seed")); - - std::uniform_real_distribution dist( - static_cast(context.Attr("min")), - static_cast(context.Attr("max"))); - - auto engine = phi::GetCPURandomEngine(seed); - std::vector ids(batch_size); - for (int i = 0; i < batch_size; ++i) { - T r = dist(*engine); - int idx = width - 1; - for (int j = 0; j < width; ++j) { - if ((r -= ins_vector[i * width + j]) < 0) { - idx = j; - break; - } - } - ids[i] = int64_t(idx); - } - - std::vector out_dim; - out_dim.push_back(static_cast(batch_size)); - - phi::DenseTensor* output = context.Output("Out"); - output->Resize(common::make_ddim(out_dim)); - output->mutable_data(context.GetPlace()); - framework::TensorFromVector(ids, context.device_context(), output); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/sampling_id_op_xpu.cc b/paddle/fluid/operators/sampling_id_op_xpu.cc deleted file mode 100644 index 9fd0193733e6e..0000000000000 --- a/paddle/fluid/operators/sampling_id_op_xpu.cc +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/fluid/operators/sampling_id_op.h" -#include "paddle/fluid/platform/device_context.h" - -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL( - sampling_id, XPU, ALL_LAYOUT, ops::SamplingIdKernel, float, double) {} diff --git a/paddle/fluid/operators/save_combine_op.h b/paddle/fluid/operators/save_combine_op.h index f5c3fb9969f1e..60c844678924b 100644 --- a/paddle/fluid/operators/save_combine_op.h +++ b/paddle/fluid/operators/save_combine_op.h @@ -185,7 +185,7 @@ class SaveCombineOpKernel : public framework::OpKernel { PADDLE_ENFORCE_GT(inp_var_names.size(), 0UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of variables to be saved is %d, expect " "it to be greater than 0.", inp_var_names.size())); @@ -199,12 +199,12 @@ class SaveCombineOpKernel : public framework::OpKernel { for (size_t i = 0; i < inp_vars.size(); i++) { PADDLE_ENFORCE_NOT_NULL( inp_vars[i], - platform::errors::InvalidArgument( - "Cannot find variable %s to save.", inp_var_names[i])); + phi::errors::InvalidArgument("Cannot find variable %s to save.", + inp_var_names[i])); PADDLE_ENFORCE_EQ( inp_vars[i]->IsType(), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "SaveCombine operator only supports saving " "phi::DenseTensor or Vocab variable, %s has wrong type.", inp_var_names[i])); @@ -222,12 +222,12 @@ class SaveCombineOpKernel : public framework::OpKernel { for (size_t i = 0; i < inp_vars.size(); i++) { PADDLE_ENFORCE_NOT_NULL( inp_vars[i], - platform::errors::InvalidArgument( - "Cannot find variable %s to save.", inp_var_names[i])); + phi::errors::InvalidArgument("Cannot find variable %s to save.", + inp_var_names[i])); PADDLE_ENFORCE_EQ( inp_vars[i]->IsType(), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "SaveCombine operator only supports saving " "phi::DenseTensor or Vocab variable, %s has wrong type.", inp_var_names[i])); diff --git a/paddle/fluid/operators/search_compute.h b/paddle/fluid/operators/search_compute.h index abf8365182483..579f31fbcf388 100644 --- a/paddle/fluid/operators/search_compute.h +++ b/paddle/fluid/operators/search_compute.h @@ -145,7 +145,7 @@ inline void axpy(const T* x, T* y, size_t len, const T alpha) { } #elif defined(PADDLE_WITH_ARM) || defined(PADDLE_WITH_SW) || \ defined(PADDLE_WITH_MIPS) || defined(PADDLE_WITH_LOONGARCH) - PADDLE_THROW(platform::errors::Unimplemented("axpy is not supported")); + PADDLE_THROW(phi::errors::Unimplemented("axpy is not supported")); #else lll = len & ~SSE_CUT_LEN_MASK; __m128x mm_alpha = _mm_load1_px(&alpha); @@ -175,7 +175,7 @@ inline void axpy_noadd(const T* x, T* y, size_t len, const T alpha) { } #elif defined(PADDLE_WITH_ARM) || defined(PADDLE_WITH_SW) || \ defined(PADDLE_WITH_MIPS) || defined(PADDLE_WITH_LOONGARCH) - PADDLE_THROW(platform::errors::Unimplemented("axpy_noadd is not supported")); + PADDLE_THROW(phi::errors::Unimplemented("axpy_noadd is not supported")); #else lll = len & ~SSE_CUT_LEN_MASK; __m128x mm_alpha = _mm_load1_px(&alpha); @@ -194,7 +194,7 @@ inline void axpy_noadd(const int8_t* x, int8_t* y, size_t len, 
const float alpha) { - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(phi::errors::Unimplemented( "int8_t input of axpy_noadd is not supported")); } diff --git a/paddle/fluid/operators/select_input_op.cc b/paddle/fluid/operators/select_input_op.cc index 3b00aab8c8e89..8383a8bec3bd3 100644 --- a/paddle/fluid/operators/select_input_op.cc +++ b/paddle/fluid/operators/select_input_op.cc @@ -43,7 +43,7 @@ class SelectInputOp : public framework::OperatorBase { PADDLE_ENFORCE_LT( output_branch, x_names.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input 'Mask' in SelectInputOp is invalid. " "'Mask' must be less than the size of input vector 'X'. " "But received Mask = %d, X's size = %d.", diff --git a/paddle/fluid/operators/select_op_helper.h b/paddle/fluid/operators/select_op_helper.h index 2b7f884f6170c..33c5879de71f2 100644 --- a/paddle/fluid/operators/select_op_helper.h +++ b/paddle/fluid/operators/select_op_helper.h @@ -28,7 +28,7 @@ namespace operators { inline int GetBranchNumber(const phi::DenseTensor &mask) { PADDLE_ENFORCE_EQ(mask.numel(), 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The numel of Input(Mask) in SelectInputOp or " "SelectOutputOp must be 1. " "But received %d, and it's shape is [%s].", @@ -43,7 +43,7 @@ inline int GetBranchNumber(const phi::DenseTensor &mask) { defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU) framework::TensorCopySync(mask, platform::CPUPlace(), cpu_mask.get()); #else - PADDLE_THROW(platform::errors::PreconditionNotMet( + PADDLE_THROW(phi::errors::PreconditionNotMet( "This version of PaddlePaddle does NOT support GPU, " "but got GPU tensor 'Mask' in SelectInputOp or SelectOutputOp. " "Please compile PaddlePaddle WITH_GPU first.")); diff --git a/paddle/fluid/operators/select_output_op.cc b/paddle/fluid/operators/select_output_op.cc index 623d1bb5c6ce9..8f61ef7bb712a 100644 --- a/paddle/fluid/operators/select_output_op.cc +++ b/paddle/fluid/operators/select_output_op.cc @@ -55,7 +55,7 @@ class SelectOutputOp : public framework::OperatorBase { PADDLE_ENFORCE_LT( output_branch, out_names.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input 'Mask' in SelectOutputOp is invalid. " "'Mask' must be less than the size of output vector 'Out'. " "But received Mask = %d, Out's size = %d.", diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc deleted file mode 100644 index dd65162b3aad4..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
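The SamplingIdKernel removed above draws one id per batch row by inverse-CDF sampling: draw r in [min, max), then walk the row subtracting probabilities until the remainder turns negative. A minimal standalone sketch of the same technique (plain C++; names and defaults are illustrative, not the Paddle API):

// Inverse-CDF sampling as in the deleted SamplingIdKernel: subtract
// probabilities from r until it goes negative; fall back to the last
// column, as the original kernel did with idx = width - 1.
#include <cstdint>
#include <iostream>
#include <random>
#include <vector>

std::vector<int64_t> sampling_id(const std::vector<float>& probs,
                                 int batch_size, int width,
                                 float min = 0.f, float max = 1.f,
                                 unsigned seed = 0) {
  // seed == 0 means "use a system-generated seed", mirroring the op attr.
  std::mt19937 engine(seed ? seed : std::random_device{}());
  std::uniform_real_distribution<float> dist(min, max);
  std::vector<int64_t> ids(batch_size);
  for (int i = 0; i < batch_size; ++i) {
    float r = dist(engine);
    int idx = width - 1;
    for (int j = 0; j < width; ++j) {
      if ((r -= probs[i * width + j]) < 0) { idx = j; break; }
    }
    ids[i] = idx;
  }
  return ids;
}

int main() {
  // One row of softmax probabilities; most mass on column 2.
  std::vector<float> probs = {0.1f, 0.2f, 0.7f};
  for (int64_t id : sampling_id(probs, /*batch_size=*/1, /*width=*/3))
    std::cout << id << "\n";
}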
- -#include "paddle/fluid/operators/sequence_ops/sequence_concat_op.h" - -#include -#include - -namespace paddle { -namespace operators { - -class SeqConcatOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "The inputs of sequence concat op").AsDuplicable(); - AddOutput("Out", "The output of sequence concat op"); - AddComment( - "Sequence Concat Op\n" - "It will concat LoD tensors by its sequence information.\n" - "For example:\n" - " LoD of X1 = [0, 3, 7]\n" - " LoD of X2 = [0, 7, 9]\n" - " Result LoD is [0, (3+7), (7+9)]\n" - " i.e.[0, 10, 16]\n"); - } -}; - -class SequenceConcatOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext *context) const override { - PADDLE_ENFORCE_EQ( - context->HasInputs("X"), - true, - platform::errors::NotFound("SequenceConcatOp Input(X) of Sequence " - "Concat Op should not be null.")); - PADDLE_ENFORCE_EQ( - context->HasOutput("Out"), - true, - platform::errors::NotFound("SequenceConcatOp Output(Out) of Sequence " - "Concat Op should not be null.")); - - PADDLE_ENFORCE_GT(context->Inputs("X").size(), - 1, - platform::errors::InvalidArgument( - "The number of SequenceConcatOp inputs should be " - "greater than 1. But " - "the number of inputs we received is %d", - context->Inputs("X").size())); - auto x_dims = context->GetInputsDim("X"); - int64_t batch_size = 0; - int64_t feature_size = 0; - std::vector out_dims; - for (auto &x_dim : x_dims) { - if (out_dims.empty()) { - out_dims = common::vectorize(x_dim); - } - batch_size += x_dim[0]; - PADDLE_ENFORCE_NE( - x_dim[0], - 0, - platform::errors::InvalidArgument( - "The first dim of SequenceConcatOp inputs must not be 0.")); - if (feature_size == 0) { - feature_size = common::product(x_dim) / x_dim[0]; - } else { - PADDLE_ENFORCE_EQ( - feature_size, - common::product(x_dim) / x_dim[0], - platform::errors::InvalidArgument( - "Each input of SequenceConcatOp inputs must have same feature " - "size, But " - "the feature size we received is %d, the feature size of 1st " - "input is %d", - feature_size, - common::product(x_dim) / x_dim[0])); - } - } - if (batch_size < 0) { - batch_size = -1; // Normalize batch size for compile time. - } - out_dims[0] = batch_size; - context->SetOutputDim("Out", common::make_ddim(out_dims)); - if (!context->IsRuntime()) { // Runtime LoD infershape will be computed - // in Kernel. 
- context->ShareLoD("X", "Out"); - } - } -}; - -template -class SeqConcatGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("sequence_concat_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X", false)); - op->SetAttrMap(this->Attrs()); - } -}; - -class SeqConcatGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *context) const override { - context->SetOutputsDim(framework::GradVarName("X"), - context->GetInputsDim("X")); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.GetPlace()); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(SeqConcatGradNoNeedBufferVarsInferer, "X"); - -} // namespace operators -} // namespace paddle - -namespace op = paddle::operators; - -REGISTER_OPERATOR(sequence_concat, - op::SequenceConcatOp, - op::SeqConcatOpMaker, - op::SeqConcatGradOpMaker, - op::SeqConcatGradOpMaker); -PD_REGISTER_STRUCT_KERNEL(sequence_concat, - CPU, - ALL_LAYOUT, - op::SeqConcatKernel, - float, - double, - int, - int64_t) {} - -REGISTER_OPERATOR(sequence_concat_grad, - op::SeqConcatGradOp, - op::SeqConcatGradNoNeedBufferVarsInferer); -PD_REGISTER_STRUCT_KERNEL(sequence_concat_grad, - CPU, - ALL_LAYOUT, - op::SeqConcatGradKernel, - float, - double, - int, - int64_t) {} diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc deleted file mode 100644 index b668a9d2558ef..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/sequence_ops/sequence_concat_op.h" - -#include "paddle/fluid/framework/op_registry.h" - -PD_REGISTER_STRUCT_KERNEL(sequence_concat, - GPU, - ALL_LAYOUT, - paddle::operators::SeqConcatKernel, - float, - double, - int, - int64_t) {} -PD_REGISTER_STRUCT_KERNEL(sequence_concat_grad, - GPU, - ALL_LAYOUT, - paddle::operators::SeqConcatGradKernel, - float, - double, - int, - int64_t) {} diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h b/paddle/fluid/operators/sequence_ops/sequence_concat_op.h deleted file mode 100644 index 463cadc3ce733..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h +++ /dev/null @@ -1,188 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/concat_and_split.h" - -namespace paddle { -namespace operators { - -namespace detail { -template -inline framework::LoD ConcatLoD(const Container &xs, - std::vector *xs_in_order) { - std::vector result; - result.resize(xs[0].get().lod()[0].size()); - - for (size_t i = 1; i < result.size(); ++i) { - size_t sum = 0; - for (size_t j = 0; j < xs.size(); ++j) { - auto &x_lod = xs[j].get().lod()[0]; - const phi::DenseTensor &tensor = xs[j].get(); - if (x_lod[i - 1] < x_lod[i]) { - xs_in_order->emplace_back(tensor.Slice(x_lod[i - 1], x_lod[i])); - } - sum += x_lod[i]; - } - result[i] = sum; - } - framework::LoD lod; - lod.emplace_back(result); - return lod; -} - -template -inline std::vector> GetDataVectorSafely( - const std::vector &vec, ARGS &&...args) { - std::vector> result; - result.reserve(vec.size()); - for (auto *ptr : vec) { - PADDLE_ENFORCE_NOT_NULL(ptr, - platform::errors::InvalidArgument( - "The input variable X contains nullptr.")); - result.emplace_back(*ptr); - } - return result; -} -} // namespace detail - -template -class SeqConcatKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto xs = - detail::GetDataVectorSafely(context.MultiInput("X")); - auto &out = *context.Output("Out"); - - size_t lod_size = 0; - for (auto &x : xs) { - if (lod_size == 0) { - PADDLE_ENFORCE_EQ(x.get().lod().empty(), - false, - platform::errors::NotFound( - "Input(X) Tensor of SequenceConcatOp does not " - "contain LoD information.")); - lod_size = x.get().lod()[0].size(); - } else { - PADDLE_ENFORCE_EQ(lod_size, - x.get().lod()[0].size(), - platform::errors::InvalidArgument( - "The lod size of each input must be the same, " - "But the lod size of input we received is %d, " - "the first input is %d", - x.get().lod()[0].size(), - lod_size)); - } - } - PADDLE_ENFORCE_NE( - lod_size, - 0, - platform::errors::InvalidArgument( - "Each input must have sequence lod information. 
But we " - "received input lod size is %d", - lod_size)); - - std::vector x_in_order; - out.set_lod(detail::ConcatLoD(xs, &x_in_order)); - out.mutable_data(context.GetPlace()); - math::ConcatFunctor functor; - functor( - context.template device_context(), x_in_order, 0, &out); - } -}; - -template -class SeqConcatGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto xs = context.MultiInput("X"); - auto dxs = - context.MultiOutput(framework::GradVarName("X")); - PADDLE_ENFORCE_EQ(xs.size(), - dxs.size(), - platform::errors::InvalidArgument( - "The rank of Input X and Output Grad X must be " - "same, But the rank of Input X we received is %d, " - "the rank of Output Grad X is %d", - xs.size(), - dxs.size())); - for (size_t i = 0; i < dxs.size(); ++i) { - if (dxs[i] != nullptr) { - dxs[i]->set_lod(xs[i]->lod()); - dxs[i]->mutable_data(context.GetPlace()); - } - } - - std::vector sliced_x; - std::vector> sliced_dx; - - for (size_t i = 1; i < xs[0]->lod()[0].size(); ++i) { - for (size_t j = 0; j < xs.size(); ++j) { - const phi::DenseTensor *x = xs[j]; - framework::DDim x_dims = x->dims(); - - phi::DenseTensor *dx = dxs[j]; - auto &x_lod = x->lod()[0]; - if (x_lod[i - 1] == x_lod[i]) continue; - - auto prev_lod = x_lod[i - 1]; - auto next_lod = x_lod[i]; - - x_dims[0] = next_lod - prev_lod; - - sliced_x.emplace_back(); - sliced_x.back().Resize(x_dims); - - if (dx) { - sliced_dx.emplace_back(dx->Slice(prev_lod, next_lod)); - } else { - sliced_dx.emplace_back(paddle::none); - } - } - } - - std::vector sliced_x_ptr; - sliced_x_ptr.reserve(sliced_x.size()); - for (auto &x : sliced_x) { - sliced_x_ptr.emplace_back(&x); - } - - std::vector sliced_dx_ptr; - sliced_dx_ptr.reserve(sliced_dx.size()); - for (auto &dx : sliced_dx) { - if (dx) { - sliced_dx_ptr.emplace_back(&dx.get()); - } - } - - math::SplitFunctor functor; - functor(context.template device_context(), - GET_DATA_SAFELY( - context.Input(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "SeqConcatGrad"), - sliced_x_ptr, - 0, - &sliced_dx_ptr); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc index c94f57807cd52..24109e8ed4531 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc @@ -40,14 +40,14 @@ class SequenceConvOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->Attrs().Get("contextStride"), 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Currently, SequenceConvOp only supports contextStride=1. But " "received contextStride = %u.", ctx->Attrs().Get("contextStride"))); PADDLE_ENFORCE_EQ( in_dims.size() == 2 && filter_dims.size() == 2, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(X, Filter) should be 2-D tensor. But received Input(X): " "input rank %u, input shape [%s]; received Input(Filter): " "input rank %u, input shape [%s].", @@ -58,7 +58,7 @@ class SequenceConvOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( filter_dims[0], context_length * in_dims[1], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Filter's height should be context_length * " "input_hidden_size. 
But received: filter's height = %d, " "context_length * input_hidden_size = %d.", @@ -82,13 +82,13 @@ class SequenceConvOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( start_length, false, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "If context_start is 0 and context_length is 1, paddingTrainable " "should be false.")); PADDLE_ENFORCE_EQ( padding_dim.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(PaddingData) should be 2-D tensor. But received: " "input rank %u, input shape [%s].", padding_dim.size(), @@ -96,14 +96,14 @@ class SequenceConvOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( padding_dim[0] == total_pad && padding_dim[1] == input_width, true, - platform::errors::InvalidArgument("Input(PaddingData)'s shape is not " - "consistent with 'context_start' " - "and 'context_length'. Received " - "Input(PaddingData): input rank " - "%u, " - "input shape [%s].", - padding_dim.size(), - padding_dim)); + phi::errors::InvalidArgument("Input(PaddingData)'s shape is not " + "consistent with 'context_start' " + "and 'context_length'. Received " + "Input(PaddingData): input rank " + "%u, " + "input shape [%s].", + padding_dim.size(), + padding_dim)); } in_dims[1] = filter_dims[1]; diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.h b/paddle/fluid/operators/sequence_ops/sequence_conv_op.h index 347db6e37db10..b7820f5dda0a7 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op.h @@ -39,13 +39,13 @@ class SequenceConvKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(in->lod().empty(), false, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(X) phi::DenseTensor of SequenceConvOp " "does not contain LoD information.")); PADDLE_ENFORCE_EQ( in->lod().size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only support input sequence with lod level equal to 1 at " "present. But received: lod level %u.", in->lod().size())); @@ -107,7 +107,7 @@ class SequenceConvGradKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( in->lod().size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only support input sequence with lod level equal to 1 at " "present. But received: lod level %u.", in->lod().size())); diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc index 53fb13180c36a..94f65ecd1c6e4 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc @@ -37,13 +37,13 @@ class SequenceConvXPUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(in->lod().empty(), false, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(X) phi::DenseTensor of SequenceConvOp " "does not contain LoD information.")); PADDLE_ENFORCE_EQ( in->lod().size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only support input sequence with lod level equal to 1 at " "present. 
But received: lod level %u.", in->lod().size())); @@ -51,19 +51,19 @@ class SequenceConvXPUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( padding_trainable, false, - platform::errors::InvalidArgument("Only support padding_trainable " - "equal false.")); + phi::errors::InvalidArgument("Only support padding_trainable " + "equal false.")); int up_pad = std::max(0, -context_start); int down_pad = std::max(0, context_start + context_length - 1); PADDLE_ENFORCE_EQ( up_pad, 2, - platform::errors::InvalidArgument("Only support up_pad equal 2.")); + phi::errors::InvalidArgument("Only support up_pad equal 2.")); PADDLE_ENFORCE_EQ( down_pad, 2, - platform::errors::InvalidArgument("Only support down_pad equal 2.")); + phi::errors::InvalidArgument("Only support down_pad equal 2.")); auto xpu_context = context.template device_context().x_context(); @@ -73,8 +73,8 @@ class SequenceConvXPUKernel : public framework::OpKernel { xpu::ctx_guard RAII_GUARD(xpu_context); int col_numel = col_shape[0] * col_shape[1]; T* col_data = RAII_GUARD.alloc_l3_or_gm(col_numel); - PADDLE_ENFORCE_NOT_NULL( - col_data, paddle::platform::errors::Fatal("XPU memory is not enough")); + PADDLE_ENFORCE_NOT_NULL(col_data, + phi::errors::Fatal("XPU memory is not enough")); auto lod_level_0 = in->lod()[0]; int lod_size = lod_level_0.size(); @@ -84,7 +84,7 @@ class SequenceConvXPUKernel : public framework::OpKernel { PADDLE_ENFORCE_LE( lod_size, 257, - platform::errors::InvalidArgument("Only support batch size <= 256.")); + phi::errors::InvalidArgument("Only support batch size <= 256.")); std::vector cpu_lodx(lod_size); for (int i = 0; i < lod_size; i++) { @@ -113,7 +113,7 @@ class SequenceConvXPUKernel : public framework::OpKernel { int n = filter.dims()[1]; PADDLE_ENFORCE_EQ(k, k1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of FC in SequenceConvOp is invalid." "The k of matrix A is %d, k1 of matrix B is %d." "But expect k == k1", @@ -173,13 +173,13 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(in->lod().empty(), false, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(X) phi::DenseTensor of SequenceConvOp " "does not contain LoD information.")); PADDLE_ENFORCE_EQ( in->lod().size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only support input sequence with lod level equal to 1 at " "present. 
But received: lod level %u.", in->lod().size())); @@ -187,26 +187,26 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( padding_trainable, false, - platform::errors::InvalidArgument("Only support padding_trainable " - "equal false.")); + phi::errors::InvalidArgument("Only support padding_trainable " + "equal false.")); int up_pad = std::max(0, -context_start); int down_pad = std::max(0, context_start + context_length - 1); PADDLE_ENFORCE_EQ( up_pad, 2, - platform::errors::InvalidArgument("Only support up_pad equal 2.")); + phi::errors::InvalidArgument("Only support up_pad equal 2.")); PADDLE_ENFORCE_EQ( down_pad, 2, - platform::errors::InvalidArgument("Only support down_pad equal 2.")); + phi::errors::InvalidArgument("Only support down_pad equal 2.")); auto lod_level_0 = in->lod()[0]; int lod_size = lod_level_0.size(); PADDLE_ENFORCE_LE( lod_size, 257, - platform::errors::InvalidArgument("Only support batch size <= 256.")); + phi::errors::InvalidArgument("Only support batch size <= 256.")); std::vector cpu_lodx(lod_size); for (int i = 0; i < lod_size; i++) { @@ -223,8 +223,8 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { xpu::ctx_guard RAII_GUARD(xpu_context); int col_numel = col_shape[0] * col_shape[1]; T* col_data = RAII_GUARD.alloc_l3_or_gm(col_numel); - PADDLE_ENFORCE_NOT_NULL( - col_data, paddle::platform::errors::Fatal("XPU memory is not enough")); + PADDLE_ENFORCE_NOT_NULL(col_data, + phi::errors::Fatal("XPU memory is not enough")); if (in_g || filter_g) { bool trans_a = false; @@ -235,7 +235,7 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { int k1 = filter->dims()[1]; PADDLE_ENFORCE_EQ(k, k1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of FC in SequenceConvGradOp is invalid." "The k of matrix A is %d, k1 of matrix B is %d." "But expect k == k1", @@ -273,10 +273,10 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { } if (in_g) { - PADDLE_ENFORCE_LT(sequence_width, - 512, - platform::errors::InvalidArgument( - "Only support sequence_width < 512.")); + PADDLE_ENFORCE_LT( + sequence_width, + 512, + phi::errors::InvalidArgument("Only support sequence_width < 512.")); in_g->mutable_data(context.GetPlace()); in_g->set_lod(in->lod()); @@ -317,7 +317,7 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { int n = out_g->dims()[1]; PADDLE_ENFORCE_EQ(k, k1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of FC in SequenceConvGradOp is invalid." "The k of matrix A is %d, k1 of matrix B is %d." "But expect k == k1", diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc deleted file mode 100644 index 9ff5f1f96f389..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
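The ConcatLoD helper in the deleted sequence_concat_op.h merges level-0 LoDs entry-wise, so sequence lengths add up pairwise: [0, 3, 7] and [0, 7, 9] give [0, 10, 16], as in the deleted op comment. A standalone sketch of that merge (illustrative names, not the Paddle API):

// Entry-wise LoD merge as in the deleted ConcatLoD helper: position i
// of the result is the sum of position i across all input LoDs.
#include <cstddef>
#include <iostream>
#include <vector>

std::vector<size_t> concat_lod(const std::vector<std::vector<size_t>>& lods) {
  std::vector<size_t> result(lods[0].size(), 0);
  for (size_t i = 1; i < result.size(); ++i) {
    size_t sum = 0;
    for (const auto& lod : lods) sum += lod[i];
    result[i] = sum;
  }
  return result;
}

int main() {
  // Example from the deleted SeqConcatOpMaker comment:
  // [0, 3, 7] + [0, 7, 9] -> [0, 10, 16]
  for (size_t v : concat_lod({{0, 3, 7}, {0, 7, 9}})) std::cout << v << " ";
  std::cout << "\n";
}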
- -#include "paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h" - -namespace paddle { -namespace operators { - -class SequenceEnumerateOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SequenceEnumerate"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SequenceEnumerate"); - - const auto x_dims = ctx->GetInputDim("X"); - const auto win_size = ctx->Attrs().Get("win_size"); - ctx->SetOutputDim("Out", {x_dims[0], win_size}); - ctx->ShareLoD("X", "Out"); - } -}; - -class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(2-D phi::DenseTensor with the 2nd dimension equal to 1) " - "Input phi::DenseTensor of SequenceEnumerate operator."); - AddOutput("Out", - "(2-D phi::DenseTensor with the 2nd dimension equal to win_size) " - "Output phi::DenseTensor of SequenceEnumerate operator."); - AddAttr("win_size", "(int) The enumerate sequence window size.") - .AddCustomChecker([](const int& win_size) { - PADDLE_ENFORCE_GE(win_size, - 2, - platform::errors::InvalidArgument( - "The window size should be not less than 2." - "Received window size is %d", - win_size)); - }); - AddAttr("pad_value", "(int) The enumerate sequence padding value.") - .SetDefault(0); - AddAttr(framework::kAllKernelsMustComputeRuntimeShape, - "Skip calling InferShape() function in the runtime.") - .SetDefault(true); - AddComment(R"DOC( -Sequence Enumerate Operator. - -Generate a new sequence for the input index sequence, which enumerates all the -sub-sequences with length `win_size` of the input. -The enumerated sequence has the same 1st dimension with variable `input`, and -the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation. - -Examples: -Case 1: - Input: - X.lod = [[0, 3, 5]] - X.data = [[1], [2], [3], [4], [5]] - X.dims = [5, 1] - Attrs: - win_size = 2 - pad_value = 0 - Output: - Out.lod = [[0, 3, 5]] - Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]] - Out.dims = [5, 2] - -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(sequence_enumerate, - ops::SequenceEnumerateOp, - ops::SequenceEnumerateOpMaker); -PD_REGISTER_STRUCT_KERNEL(sequence_enumerate, - CPU, - ALL_LAYOUT, - ops::SequenceEnumerateKernel, - int32_t, - int64_t) {} diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu deleted file mode 100644 index 7884232e5b10f..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include - -#include "paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h" -#include "paddle/phi/backends/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { -using phi::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void CalcOutPut(const T* in_data, - const size_t* in_lod, - const size_t lod_len, - const int64_t win_size, - const int64_t pad_value, - T* out_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index < in_lod[lod_len - 1]) { - int end_idx = 0; - // Get LoD interval of index - for (int i = 1; i < lod_len; ++i) { - if (index < in_lod[i]) { - end_idx = in_lod[i]; - break; - } - } - for (size_t i = 0; i < win_size; ++i) { - int word_pos = index + i; - out_data[index * win_size + i] = - word_pos < end_idx ? in_data[word_pos] : pad_value; - } - } -} - -template -class SequenceEnumerateOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - int win_size = context.Attr("win_size"); - int pad_value = context.Attr("pad_value"); - - auto in_dims = in->dims(); - auto in_lod = in->lod(); - - PADDLE_ENFORCE_EQ( - static_cast(in_dims[0]), - in_lod[0].back(), - platform::errors::InvalidArgument( - "The actual input data's size mismatched with LoD information." - "Received input data size is %d (actual) vs %d (loD information).", - static_cast(in_dims[0]), - in_lod[0].back())); - - /* Generate enumerate sequence set */ - auto stream = context.cuda_device_context().stream(); - auto lod0 = in_lod[0]; - auto in_len = in->numel(); - auto in_data = in->data(); - out->Resize({in_dims[0], win_size}); - auto out_data = out->mutable_data(context.GetPlace()); - // Copy LoD to GPU - phi::MixVector mixv_lod0(&lod0); - const size_t* dev_in_lod_ptr = mixv_lod0.CUDAData(context.GetPlace()); - // Calc output tensor - CalcOutPut<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, - PADDLE_CUDA_NUM_THREADS, - 0, - stream>>>( - in_data, dev_in_lod_ptr, lod0.size(), win_size, pad_value, out_data); - out->set_lod(in->lod()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL(sequence_enumerate, - GPU, - ALL_LAYOUT, - ops::SequenceEnumerateOpCUDAKernel, - int32_t, - int64_t) {} diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h deleted file mode 100644 index c66f4065a58f1..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
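The deleted SequenceEnumerate kernels emit, for every position of each level-0 sequence, the next win_size ids, padding with pad_value past the sequence end. A standalone sketch reproducing Case 1 from the deleted op comment (illustrative names, not the Paddle API):

// Window enumeration as in the deleted SequenceEnumerateKernel: one
// win_size-wide row per input position, padded at sequence boundaries.
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> enumerate_seq(const std::vector<int64_t>& data,
                                   const std::vector<size_t>& lod0,
                                   int win_size, int64_t pad_value) {
  std::vector<int64_t> out;
  out.reserve(data.size() * win_size);
  for (size_t i = 0; i + 1 < lod0.size(); ++i) {
    for (size_t idx = lod0[i]; idx < lod0[i + 1]; ++idx) {
      for (int k = 0; k < win_size; ++k) {
        size_t pos = idx + k;
        // Pad once the window runs past the end of this sequence.
        out.push_back(pos < lod0[i + 1] ? data[pos] : pad_value);
      }
    }
  }
  return out;
}

int main() {
  // Case 1 from the deleted op comment: lod [0, 3, 5], win_size 2,
  // pad_value 0 -> [1 2] [2 3] [3 0] [4 5] [5 0]
  auto out = enumerate_seq({1, 2, 3, 4, 5}, {0, 3, 5}, 2, 0);
  for (size_t i = 0; i < out.size(); i += 2)
    std::cout << "[" << out[i] << " " << out[i + 1] << "] ";
  std::cout << "\n";
}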
- -#pragma once - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class SequenceEnumerateKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - int win_size = context.Attr("win_size"); - auto pad_value = static_cast(context.Attr("pad_value")); - - PADDLE_ENFORCE_EQ( - in->lod().empty(), - false, - platform::errors::InvalidArgument( - "Input(X) phi::DenseTensor of SequenceEnumerateOp does not contain " - "LoD information.")); - - auto in_dims = common::vectorize(in->dims()); - auto lod0 = in->lod()[0]; - PADDLE_ENFORCE_EQ( - static_cast(in_dims[0]), - lod0.back(), - platform::errors::InvalidArgument( - "The actual input data's size mismatched with LoD information." - "Received input data size is %d (actual) vs %d (loD information).", - static_cast(in_dims[0]), - lod0.back())); - PADDLE_ENFORCE_EQ( - in_dims.size(), - 2UL, - platform::errors::InvalidArgument( - "Input(X) of SequenceEnumerate operator's rank should be 2." - "Received %d instead.", - in_dims.size())); - PADDLE_ENFORCE_EQ(in_dims[1], - 1, - platform::errors::InvalidArgument( - "Input(X) of SequenceEnumerate operator's 2nd " - "dimension should be 1. Received %d instead.", - in_dims[1])); - - // Generate enumerate sequence set - auto in_data = in->data(); - out->Resize({in_dims[0], win_size}); - out->set_lod(in->lod()); - auto out_data = out->mutable_data(context.GetPlace()); - for (size_t i = 0; i < lod0.size() - 1; ++i) { - if (lod0[i] == lod0[i + 1]) continue; - int start = lod0[i]; - int end = lod0[i + 1]; - - int copy_size = win_size < end - start + 1 ? win_size : end - start + 1; - int mid = end + 1 - copy_size; - int pad_num = win_size - copy_size; - copy_size *= sizeof(T); - for (int idx = start; idx < mid; ++idx) { - std::memcpy(out_data, in_data + idx, copy_size); - out_data += win_size; - } - for (int idx = mid; idx < end; ++idx) { - copy_size -= sizeof(T); - pad_num++; - std::memcpy(out_data, in_data + idx, copy_size); - T* pdata = out_data + copy_size / sizeof(T); - for (int i = 0; i < pad_num; ++i) { - pdata[i] = pad_value; - } - out_data += win_size; - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc index 7f6eeff11b5be..03edbdc1a5d04 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc @@ -29,7 +29,7 @@ class SequenceEraseOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE( x_dims.size() == 2 && x_dims[1] == 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(X) of SequenceEraseOp should be a 2-D phi::DenseTensor " "with the 2nd dimension equal to 1," "but received size %d with the 2nd dimension %d.", diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu index bbc80587a9cf7..8b4b76a762d94 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu @@ -73,7 +73,7 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( lod[lod.size() - 1].back(), (size_t)in->numel(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The actual size 
mismatches with the LoD information.")); auto tokens = ctx.Attr>("tokens"); auto in_len = in->numel(); diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.h b/paddle/fluid/operators/sequence_ops/sequence_erase_op.h index 9a4aef1d93ab4..505c4245155ad 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.h @@ -32,11 +32,11 @@ class SequenceEraseKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( lod.empty(), false, - platform::errors::InvalidArgument("Input(X) Tensor of SequenceEraseOp " - "does not contain LoD information.")); + phi::errors::InvalidArgument("Input(X) Tensor of SequenceEraseOp " + "does not contain LoD information.")); PADDLE_ENFORCE_EQ(lod[lod.size() - 1].back(), static_cast(in->numel()), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The actual input size %d mismatches with the LoD " "information size %d.", lod[lod.size() - 1].back(), diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc deleted file mode 100644 index 1f9fd565ca77c..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc +++ /dev/null @@ -1,227 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h" - -#include -#include - -namespace paddle { -namespace operators { - -class SequenceExpandAsOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "SequenceExpandAs"); - OP_INOUT_CHECK(ctx->HasInputs("Y"), "Input", "Y", "SequenceExpandAs"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SequenceExpandAs"); - - auto x_dims = ctx->GetInputDim("X"); - auto out_dims = x_dims; - - PADDLE_ENFORCE_GE(x_dims.size(), - 2, - platform::errors::InvalidArgument( - "Dimension number of Input(X) should be at least 2. " - "But received X's dimensions = %d, X's shape = [%s].", - x_dims.size(), - x_dims)); - - if (ctx->IsRuntime()) { - framework::Variable* x_var = - PADDLE_GET(framework::Variable*, ctx->GetInputVarPtrs("X")[0]); - framework::Variable* y_var = - PADDLE_GET(framework::Variable*, ctx->GetInputVarPtrs("Y")[0]); - - auto& x_dim = x_var->Get().dims(); - auto& y_lod = y_var->Get().lod(); - - PADDLE_ENFORCE_EQ(y_lod.size(), - 1, - platform::errors::InvalidArgument( - "Level number of Input(Y)'s lod should be 1. But " - "received Y's lod level = %d.", - y_lod.size())); - - PADDLE_ENFORCE_EQ(static_cast(x_dim[0]), - y_lod[0].size() - 1, - platform::errors::InvalidArgument( - "The first dimension of Input(X) should be one " - "less than the size of Input(Y)'s 0 level lod. 
But " - "received X's shape[0] = %d, Y's lod[0].size = %d.", - x_dim[0], - y_lod[0].size())); - - int64_t out_first_dim = 0; - if (y_lod[0].size() <= 1) { - out_first_dim = x_dims[0]; - } else { - for (size_t i = 1; i < y_lod[0].size(); ++i) { - out_first_dim += static_cast(y_lod[0][i] - y_lod[0][i - 1]); - } - } - out_dims[0] = out_first_dim; - } else { - out_dims[0] = -1; - } - - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("Y", /*->*/ "Out"); - } - - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class SequenceExpandAsOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(phi::DenseTensor, default phi::DenseTensor) A 2-D " - "phi::DenseTensor whose lod " - "level is at most 1."); - AddInput("Y", - "(phi::DenseTensor, default phi::DenseTensor) Referred " - "phi::DenseTensor whose " - "lod (specified level) is referred by Input(X)."); - AddOutput("Out", - "(phi::DenseTensor, default phi::DenseTensor) Output " - "phi::DenseTensor which is " - "generated from Input(X) by referring lod of Input(Y)."); - AddComment(R"DOC( -Sequence Expand As Operator. - -This operator expands `X` according to the zeroth level lod of `Y`. Current -implementation requires the level number of Input(Y)'s lod should be 1, and -the first dimension of Input(X) should be equal to the size of Input(Y)'s zeroth -level lod, and lod of Input(X) is not considered. - -Following are cases to better explain how this works: - -Case 1: - -Given a 1-level phi::DenseTensor input(X) - X.data = [[a], [b], [c], [d]] - X.dims = [4, 1] -and input(Y) - Y.lod = [[0, 3, 6, 7, 8]] -ref_level: 0 -then we get 1-level phi::DenseTensor - Out.lod = [[0, 3, 6, 7, 8]] - Out.data = [[a], [a], [a], [b], [b], [b], [c], [d]] - Out.dims = [8, 1] - -Case 2: - -Given a common phi::DenseTensor input(X) - X.data = [[a, b], [c, d], [e, f]] - X.dims = [3, 2] -and input(Y) - Y.lod = [[0, 2, 3, 6]] -ref_level: 0 -then we get a common phi::DenseTensor - Out.lod = [[0, 2, 3, 6]] - Out.data = [[a, b], [a, b] [c, d], [e, f], [e, f], [e, f]] - Out.dims = [6, 2] - -)DOC"); - } -}; - -class SequenceExpandAsOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "SequenceExpandAsGrad"); - OP_INOUT_CHECK(ctx->HasInputs(framework::GradVarName("Out")), - "Input", - "Out@GRAD", - "SequenceExpandAsGrad"); - - auto x_dims = ctx->GetInputDim("X"); - auto x_grad_name = framework::GradVarName("X"); - - if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); - ctx->ShareLoD("X", x_grad_name); - } - } - - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.GetPlace()); - } -}; - -template -class SequenceExpandAsOpGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("sequence_expand_as_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("Y", this->Input("Y")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - 
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetAttrMap(this->Attrs()); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(SequenceExpandAsOpNoNeedBufferVarsInferer, - "Y"); -DECLARE_NO_NEED_BUFFER_VARS_INFERER( - SequenceExpandAsGradOpNoNeedBufferVarsInferer, "X", "Y"); - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - sequence_expand_as, - ops::SequenceExpandAsOp, - ops::SequenceExpandAsOpMaker, - ops::SequenceExpandAsOpGradOpMaker, - ops::SequenceExpandAsOpGradOpMaker, - ops::SequenceExpandAsOpNoNeedBufferVarsInferer); -REGISTER_OPERATOR(sequence_expand_as_grad, - ops::SequenceExpandAsOpGrad, - ops::SequenceExpandAsGradOpNoNeedBufferVarsInferer); -PD_REGISTER_STRUCT_KERNEL(sequence_expand_as, - CPU, - ALL_LAYOUT, - ops::SequenceExpandAsKernel, - float, - double, - int, - int64_t) {} -PD_REGISTER_STRUCT_KERNEL(sequence_expand_as_grad, - CPU, - ALL_LAYOUT, - ops::SequenceExpandAsGradKernel, - float, - double, - int, - int64_t) {} diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu deleted file mode 100644 index 053c439814e95..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu +++ /dev/null @@ -1,148 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
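The deleted SequenceExpandAsFunctor repeats row h of X exactly span = ref_lod[h+1] - ref_lod[h] times, which is how Case 1 above turns X = [a, b, c, d] with Y.lod = [0, 3, 6, 7, 8] into [a, a, a, b, b, b, c, d]. A standalone CPU sketch of that expansion (illustrative names, not the Paddle API):

// Row expansion as in the deleted CPU SequenceExpandAsFunctor: each
// input row is copied into the span of output rows its LoD entry covers.
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

template <typename T>
std::vector<T> expand_as(const std::vector<T>& x, size_t width,
                         const std::vector<size_t>& ref_lod) {
  std::vector<T> out(ref_lod.back() * width);
  size_t height = x.size() / width;
  for (size_t h = 0; h < height; ++h) {
    size_t span = ref_lod[h + 1] - ref_lod[h];  // rows to produce from row h
    for (size_t k = 0; k < span; ++k)
      for (size_t w = 0; w < width; ++w)
        out[(ref_lod[h] + k) * width + w] = x[h * width + w];
  }
  return out;
}

int main() {
  // Case 1 from the deleted op comment:
  // X = [a b c d], Y.lod = [0 3 6 7 8] -> [a a a b b b c d]
  for (const auto& s :
       expand_as<std::string>({"a", "b", "c", "d"}, 1, {0, 3, 6, 7, 8}))
    std::cout << s << " ";
  std::cout << "\n";
}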
*/ - -#include - -#include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h" -#include "paddle/phi/backends/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -template -static __global__ void sequence_expand_as_kernel(const T *in_data, - const size_t *expand_offset, - const size_t src_hight, - const size_t src_widht, - T *out_data) { - for (int h_id = blockIdx.x; h_id < src_hight; h_id += gridDim.x) { - int span = expand_offset[h_id + 1] - expand_offset[h_id]; - if (span == 0) continue; - const T *src = in_data + h_id * src_widht; - for (int w_id = threadIdx.x; w_id < src_widht; w_id += blockDim.x) { - T ele = src[w_id]; - int offset = expand_offset[h_id] * src_widht; - for (int k = 0; k < span; ++k) { - out_data[offset + k * src_widht + w_id] = ele; - } - } - } -} - -template -static __global__ void sequence_expand_as_grad_kernel( - const T *dout_data, - const size_t *expand_offset, - const size_t dst_hight, - const size_t dst_width, - T *dx_data) { - for (int h_id = blockIdx.x; h_id < dst_hight; h_id += gridDim.x) { - T *dst = dx_data + h_id * dst_width; - int span = expand_offset[h_id + 1] - expand_offset[h_id]; - - for (int w_id = threadIdx.x; w_id < dst_width; w_id += blockDim.x) { - T result = 0; - for (int k = 0; k < span; ++k) { - int offset = (expand_offset[h_id] + k) * dst_width; - const T *src = dout_data + offset; - result += src[w_id]; - } - dst[w_id] = result; - } - } -} - -template -struct SequenceExpandAsFunctor { - void operator()(const phi::GPUContext &context, - const phi::DenseTensor &x, - const phi::Vector &ref_lod, /*expand referenced lod*/ - phi::DenseTensor *out) { - int height = x.dims()[0]; - int width = common::product(x.dims()) / height; - - const int kThreadsPerBlock = 1024; - int thread_x = kThreadsPerBlock; - if (width < kThreadsPerBlock) { // block_cols is aligned by 32. - thread_x = ((width + 31) >> 5) << 5; - } - - int max_threads = context.GetMaxPhysicalThreadCount(); - int block_x = std::max(max_threads / thread_x, 1); - - dim3 block_size(thread_x); - dim3 grid_size(block_x); - phi::MixVector mixv_ref_lod(&ref_lod); - sequence_expand_as_kernel<<>>( - x.data(), - mixv_ref_lod.CUDAData(context.GetPlace()), - height, - width, - out->mutable_data(context.GetPlace())); - } -}; - -template -struct SequenceExpandAsGradFunctor { - void operator()(const phi::GPUContext &context, - const phi::DenseTensor &dout, - const phi::Vector &ref_lod, /*expand based lod*/ - phi::DenseTensor *dx) { - int height = dx->dims()[0]; - int width = common::product(dx->dims()) / height; - - const int kThreadsPerBlock = 1024; - int thread_x = kThreadsPerBlock; - if (width < kThreadsPerBlock) { // block_cols is aligned by 32. 
- thread_x = ((width + 31) >> 5) << 5; - } - - int max_threads = context.GetMaxPhysicalThreadCount(); - int block_x = std::max(max_threads / thread_x, 1); - - dim3 block_size(thread_x); - dim3 grid_size(block_x); - phi::MixVector mixv_ref_lod(&ref_lod); - sequence_expand_as_grad_kernel<<>>( - dout.data(), - mixv_ref_lod.CUDAData(context.GetPlace()), - height, - width, - dx->mutable_data(context.GetPlace())); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL(sequence_expand_as, - GPU, - ALL_LAYOUT, - ops::SequenceExpandAsKernel, - float, - double, - int, - int64_t) {} -PD_REGISTER_STRUCT_KERNEL(sequence_expand_as_grad, - GPU, - ALL_LAYOUT, - ops::SequenceExpandAsGradKernel, - float, - double, - int, - int64_t) {} diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h deleted file mode 100644 index d9a1d419f5a9e..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h +++ /dev/null @@ -1,167 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include // std::iota -#include -#include - -#include "glog/logging.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -struct SequenceExpandAsFunctor { - void operator()(const DeviceContext &ctx, - const phi::DenseTensor &x, - const phi::Vector &ref_lod, /*expand referenced lod*/ - phi::DenseTensor *out); -}; - -template -struct SequenceExpandAsGradFunctor { - void operator()(const DeviceContext &ctx, - const phi::DenseTensor &dout, - const phi::Vector &ref_lod, /*expand referenced lod*/ - phi::DenseTensor *dx); -}; - -template -struct SequenceExpandAsFunctor { - void operator()(const phi::CPUContext &context, - const phi::DenseTensor &x, - const phi::Vector &ref_lod, /*expand referenced lod*/ - phi::DenseTensor *out) { - int64_t height = x.dims()[0]; - int64_t width = common::product(x.dims()) / height; - - const T *in_data = x.data(); - T *out_data = out->mutable_data(context.GetPlace()); - - for (int h_id = 0; h_id < height; ++h_id) { - size_t span = ref_lod[h_id + 1] - ref_lod[h_id]; - if (span == 0) continue; - const T *src = in_data + h_id * width; - for (int64_t w_id = 0; w_id < width; ++w_id) { - T ele = src[w_id]; - size_t offset = ref_lod[h_id] * width; - for (size_t k = 0; k < span; ++k) { - out_data[offset + k * width + w_id] = ele; - } - } - } - } -}; - -template -class SequenceExpandAsKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *x = context.Input("X"); - auto *y = context.Input("Y"); - auto *out = context.Output("Out"); - - PADDLE_ENFORCE_EQ( - y->lod().empty(), - false, - platform::errors::InvalidArgument( - "Input(Y) of SequenceExpandAsOp has wrong LoD information. 
" - "Expected Y's lod is not empty, but received empty lod.")); - - auto &y_lod = y->lod(); - PADDLE_ENFORCE_EQ(y_lod.size(), - 1, - platform::errors::InvalidArgument( - "Input(Y) of SequenceExpandAsOp has wrong LoD " - "information. Expected Y's lod level = 1, but " - "received lod level = %d.", - y_lod.size())); - PADDLE_ENFORCE_GT(y_lod[0].size(), - 1, - platform::errors::InvalidArgument( - "Input(Y) of SequenceExpandAsOp has wrong LoD " - "information. Expected the size of Y's lod[0] > 1, " - "but received lod[0].size = %d.", - y_lod[0].size())); - - out->mutable_data(context.GetPlace()); - - auto &dev_ctx = context.template device_context(); - SequenceExpandAsFunctor seq_expand_functor; - seq_expand_functor(dev_ctx, *x, y_lod[0], out); - } -}; - -/* - *Given Grad(Out) - * - * Grad(Out).lod = [[0, 3, 6]] - * Grad(Out).data = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6] - * Then - * Grad(X).data = [(0.1 + 0.2 + 0.3), (0.4 + 0.5 + 0.6)] - * = [0.6, 1.5] - * Grad(X).lod = Input(X).lod - * - * */ -template -struct SequenceExpandAsGradFunctor { - void operator()(const phi::CPUContext &context, - const phi::DenseTensor &dout, - const phi::Vector &ref_lod, /*expand referenced lod*/ - phi::DenseTensor *dx) { - int64_t height = dx->dims()[0]; - int64_t width = common::product(dx->dims()) / height; - - const T *dout_data = dout.data(); - T *dx_data = dx->mutable_data(context.GetPlace()); - - for (int64_t h_id = 0; h_id < height; ++h_id) { - T *dst = dx_data + h_id * width; - size_t span = ref_lod[h_id + 1] - ref_lod[h_id]; - for (int64_t w_id = 0; w_id < width; ++w_id) { - T result = 0; - for (size_t k = 0; k < span; ++k) { - size_t offset = (ref_lod[h_id] + k) * width; - result += dout_data[offset + w_id]; - } - dst[w_id] = result; - } - } - } -}; - -template -class SequenceExpandAsGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *g_out = - context.Input(framework::GradVarName("Out")); - auto *y = context.Input("Y"); - auto *g_x = context.Output(framework::GradVarName("X")); - - g_x->mutable_data(context.GetPlace()); - - SequenceExpandAsGradFunctor functor; - functor(context.template device_context(), - *g_out, - y->lod()[0], - g_x); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc index e6a64be83473d..4e7deab77952a 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc @@ -38,7 +38,7 @@ class SequenceExpandOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE( x_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Dimension number of Input(X) should be at least 2. But " "received: input rank %u, input shape [%s].", x_dims.size(), @@ -55,14 +55,14 @@ class SequenceExpandOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_LE(x_lod.size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Level of Input(X)'s lod should not be " "greater than 1. But received: lod level %u.", x_lod.size())); PADDLE_ENFORCE_GT( y_lod.size(), 0UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Level of Input(Y)'s lod should be greater than 0. 
But " "received: lod level %u.", y_lod.size())); @@ -70,7 +70,7 @@ class SequenceExpandOp : public framework::OperatorWithKernel { ref_level == -1 || (ref_level >= 0 && ref_level < static_cast(y_lod.size())), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Invalid `ref_level`, which should be either equal to -1 " "or in [0, %d), but received `ref_level` = %u.", y_lod.size(), @@ -82,7 +82,7 @@ class SequenceExpandOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( x_lod[0].size(), y_lod[ref_level].size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Level number of Input(X)'s lod could be 0. Otherwise " "size of Input(X)'s first level lod should be equal to " "size of Input(Y)'s referred level lod. But received: " @@ -95,7 +95,7 @@ class SequenceExpandOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( x_dims[0], static_cast(y_lod[ref_level].size()) - 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "When Input(X)'s lod is null, the dims[0] of " "Input(X) should match the " "size of Input(Y)'s referred level lod. But received " diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.h b/paddle/fluid/operators/sequence_ops/sequence_expand_op.h index 0f53249cfbc24..1204775c44226 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.h @@ -96,7 +96,7 @@ class SequenceExpandKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( y_lod.empty(), false, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(Y) phi::DenseTensor of SequenceExpandOp does not contain " "LoD information.")); diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc index a6cd59e44dff0..a123ae14f39b1 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc @@ -61,7 +61,7 @@ class SequenceMaskOpMaker : public framework::OpProtoAndCheckerMaker { PADDLE_ENFORCE_EQ( v < 0 || v >= 1, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Attr(maxlen) must be less than 0 or larger than 1")); }); AddAttr("out_dtype", "Output data type"); diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc deleted file mode 100644 index d033ac210c7c8..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc +++ /dev/null @@ -1,305 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/sequence_ops/sequence_pad_op.h" - -#include -#include - -namespace paddle { -namespace operators { - -class SequencePadOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), - true, - platform::errors::NotFound( - "Input(X) of SequencePadOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("PadValue"), - true, - platform::errors::NotFound( - "Input(PadValue) of SequencePadOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), - true, - platform::errors::NotFound( - "Output(Out) of SequencePadOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Length"), - true, - platform::errors::NotFound( - "Output(Length) of SequencePadOp should not be null.")); - - auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_GE(x_dims.size(), - 2, - platform::errors::InvalidArgument( - "The rank of SequencePadOp Input(X) can't be less " - "than 2. But the rank we received is %d", - x_dims.size())); - auto time_step_dims = common::slice_ddim(x_dims, 1, x_dims.size()); - auto pad_value_dims = ctx->GetInputDim("PadValue"); - PADDLE_ENFORCE_EQ( - pad_value_dims == common::make_ddim({1}) || - pad_value_dims == common::make_ddim({}) || - pad_value_dims == time_step_dims, - true, - platform::errors::InvalidArgument( - "The SequencePadOp Input(PadValue) must be a scalar or a tensor " - "whose shape equals to time steps in sequences")); - - int out_dim_0 = -1; - - int padded_length = ctx->Attrs().Get("padded_length"); - if (ctx->IsRuntime()) { - // run time - framework::Variable* x_var = - PADDLE_GET(framework::Variable*, ctx->GetInputVarPtrs("X")[0]); - const auto& x_lod = x_var->Get().lod(); - PADDLE_ENFORCE_EQ(x_lod.empty(), - false, - platform::errors::NotFound( - "The SequencePadOp Input(X) must hold lod info.")); - const auto& x_lod_0 = x_lod[0]; - PADDLE_ENFORCE_GE( - x_lod_0.size(), - 2, - platform::errors::InvalidArgument( - "The size of SequencePadOp Input(X)'s lod info can't be less " - "than 2. But the size we received is %d", - x_lod_0.size())); - PADDLE_ENFORCE_EQ(x_dims[0], - static_cast(x_lod_0.back()), - platform::errors::InvalidArgument( - "The SequencePadOp Input(X)'s lod info mismatches " - "the actual tensor shape. The 1st dimension of " - "Input(X)'s lod info is %d, the 1st dimension of " - "actual tensor shape is %d", - x_dims[0], - static_cast(x_lod_0.back()))); - - int seq_num = static_cast(x_lod_0.size() - 1); - int max_seq_len = - static_cast(phi::funcs::MaximumSequenceLength(x_lod_0)); - if (padded_length == -1) { - padded_length = max_seq_len; - } - PADDLE_ENFORCE_GE( - padded_length, - max_seq_len, - platform::errors::InvalidArgument( - "The SequencePadOp Attr(padded_length) should be greater than or " - "equal to the " - "length of the longest original sequence. But the padded_length " - "we received is %d, the length of the longest original sequence " - "is %d", - padded_length, - max_seq_len)); - out_dim_0 = seq_num; - } else { - // compile time - if (padded_length == -1) { - padded_length = 1; - } - PADDLE_ENFORCE_GT( - ctx->GetLoDLevel("X"), - 0, - platform::errors::InvalidArgument( - "The LoD level of SequencePadOp Input(X) should be " - "larger than 0. 
But the LoD level we received is %d", - ctx->GetLoDLevel("X"))); - } - - std::vector out_dims_vec{out_dim_0, padded_length}; - std::vector len_dims_vec{out_dim_0}; - auto time_step_dims_vec = common::vectorize(time_step_dims); - out_dims_vec.insert(out_dims_vec.end(), - time_step_dims_vec.begin(), - time_step_dims_vec.end()); - ctx->SetOutputDim("Out", common::make_ddim(out_dims_vec)); - ctx->SetOutputDim("Length", common::make_ddim(len_dims_vec)); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return phi::KernelKey(data_type, ctx.GetPlace()); - } -}; - -class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(phi::DenseTensor, default phi::DenseTensor) Input " - "variable which " - "should contain lod information."); - AddInput("PadValue", - "(phi::DenseTensor), this phi::DenseTensor holds values that will " - "be fill into " - "padded steps. It can be a scalar or a tensor whose shape equals " - "to time steps in sequences. If it's a scalar, it will be " - "automatically broadcasted to the shape of time step."); - AddOutput("Out", - "(phi::DenseTensor) The output variable, which contains padded " - "sequences."); - AddOutput("Length", - "(phi::DenseTensor) The output variable, which contains the " - "actual length of " - "sequences before padding."); - AddAttr( - "padded_length", - "The length of padded sequences. It can be set to -1 or " - "any positive int. When it is -1, all sequences will be padded up to " - "the length of the longest one among them; when it a certain positive " - "value, it must be greater than the length of the longest original " - "sequence.") - .SetDefault(-1); - AddComment(R"DOC( - Sequence Pad Operator - - This operator pads sequences in a same batch to a consistent length. - The length is specified by attribute 'padded_length'. New elements, - whose values are specified by input 'PadValue', will be appended to - the end of each sequence, to make their final lengths consistent. 
- - Following are cases to better explain how this works: - - Case 1: - - Given a 1-level phi::DenseTensor input(X): - X.lod = [[0, 2, 5]] - X.data = [a, b, c, d, e] - and Input(PadValue): - PadValue.data = [0] - and attribute 'padded_length' = 4, - then we get phi::DenseTensor: - Out.data = [[a, b, 0, 0], - [c, d, e, 0]] - Length.data = [2, 3] - - Case 2: - - Given a 1-level phi::DenseTensor input(X): - X.lod = [[0, 2, 5]] - X.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]] - and Input(PadValue): - PadValue.data = [0] - and attribute 'padded_length' = -1, which mean using the length - of longest input sequence(3 in this case), - then we get phi::DenseTensor: - Out.data = [[[a1, a2], [b1, b2], [0, 0]], - [[c1, c2], [d1, d2], [e1, e2]]] - Length.data = [2, 3] - - Case 3: - - Given a 1-level phi::DenseTensor input(X): - X.lod = [[0, 2, 5]] - X.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]] - and Input(PadValue): - PadValue.data = [p1, p2] - and attribute 'padded_length' = -1, which mean using the length - of longest input sequence(3 in this case), - then we get phi::DenseTensor: - Out.data = [[[a1, a2], [b1, b2], [p1, p2]], - [[c1, c2], [d1, d2], [e1, e2]]] - Length.data = [2, 3] - - )DOC"); - } -}; - -class SequencePadGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), - true, - platform::errors::NotFound( - "Input(X) of SequencePadGradOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput(framework::GradVarName("Out")), - true, - platform::errors::NotFound( - "Input(Out@GRAD) of SequencePadGradOp should not be null.")); - - if (ctx->HasOutput(framework::GradVarName("X"))) { - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); - return phi::KernelKey(data_type, ctx.GetPlace()); - } -}; - -template -class SequencePadGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("sequence_pad_grad"); - op->SetAttrMap(this->Attrs()); - op->SetInput("X", this->Input("X")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(SequencePadGradOpNoNeedBufferVarsInferer, - "X"); - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(sequence_pad, - ops::SequencePadOp, - ops::SequencePadOpMaker, - ops::SequencePadGradOpMaker, - ops::SequencePadGradOpMaker); -REGISTER_OPERATOR(sequence_pad_grad, - ops::SequencePadGradOp, - ops::SequencePadGradOpNoNeedBufferVarsInferer); - -PD_REGISTER_STRUCT_KERNEL(sequence_pad, - CPU, - ALL_LAYOUT, - ops::SequencePadOpKernel, - float, - double, - int, - int64_t) {} -PD_REGISTER_STRUCT_KERNEL(sequence_pad_grad, - CPU, - ALL_LAYOUT, - ops::SequencePadGradOpKernel, - float, - double, - int, - int64_t) {} diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cu b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cu deleted file mode 100644 index 
910a4eae21f1e..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cu +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/sequence_ops/sequence_pad_op.h" - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL(sequence_pad, - GPU, - ALL_LAYOUT, - ops::SequencePadOpKernel, - float, - double, - int, - int64_t) {} -PD_REGISTER_STRUCT_KERNEL(sequence_pad_grad, - GPU, - ALL_LAYOUT, - ops::SequencePadGradOpKernel, - float, - double, - int, - int64_t) {} diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.h b/paddle/fluid/operators/sequence_ops/sequence_pad_op.h deleted file mode 100644 index d31611b94a658..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.h +++ /dev/null @@ -1,95 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
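The padding semantics spelled out in the SequencePad DOC cases above can be condensed into a few lines. A hedged sketch in plain C++ (not the Paddle kernel, which works on DenseTensors via PaddingLoDTensorFunctor; `SequencePad` here is a hypothetical helper over flat vectors):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Pads every LoD segment of a flat sequence to padded_length with pad_value
// and records each segment's original length, matching the Out/Length pair
// in the cases above. Illustrative only.
void SequencePad(const std::vector<float>& x,
                 const std::vector<size_t>& lod,  // offsets, e.g. {0, 2, 5}
                 float pad_value,
                 int padded_length,  // -1 means "use the longest segment"
                 std::vector<std::vector<float>>* out,
                 std::vector<int>* length) {
  int max_len = 0;
  for (size_t i = 0; i + 1 < lod.size(); ++i)
    max_len = std::max(max_len, static_cast<int>(lod[i + 1] - lod[i]));
  if (padded_length == -1) padded_length = max_len;
  for (size_t i = 0; i + 1 < lod.size(); ++i) {
    std::vector<float> row(padded_length, pad_value);
    for (size_t j = lod[i]; j < lod[i + 1]; ++j) row[j - lod[i]] = x[j];
    out->push_back(row);
    length->push_back(static_cast<int>(lod[i + 1] - lod[i]));
  }
}

int main() {
  // Mirrors Case 1 above: lod [[0, 2, 5]], PadValue 0, padded_length 4.
  std::vector<float> x = {1, 2, 3, 4, 5};
  std::vector<std::vector<float>> out;
  std::vector<int> length;
  SequencePad(x, {0, 2, 5}, 0.0f, 4, &out, &length);
  for (const auto& row : out) {
    for (float v : row) std::printf("%g ", v);
    std::printf("\n");  // "1 2 0 0" then "3 4 5 0"; Length = [2, 3]
  }
  return 0;
}
```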
*/ - -#pragma once - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/sequence_padding.h" - -namespace paddle { -namespace operators { - -using LoD = framework::LoD; -template -class SequencePadOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto* len_t = ctx.Output("Length"); - out->mutable_data(ctx.GetPlace()); - - PADDLE_ENFORCE_EQ(x->lod().empty(), - false, - platform::errors::NotFound( - "Input(X) phi::DenseTensor of SequencePadOp does not " - "contain LoD information.")); - - const auto* pad_value = ctx.Input("PadValue"); - - int padded_length = ctx.Attr("padded_length"); - - phi::funcs::PaddingLoDTensorFunctor()( - ctx.template device_context(), - *x, - out, - *pad_value, - padded_length, - 0, - false, - phi::funcs::kBatchLengthWidth); - - phi::DenseTensor seq_len; - seq_len.Resize(len_t->dims()); - int64_t* len_data = seq_len.mutable_data(platform::CPUPlace()); - for (size_t i = 1; i < x->lod()[0].size(); ++i) { - len_data[i - 1] = x->lod()[0][i] - x->lod()[0][i - 1]; - } - framework::TensorCopy(seq_len, - ctx.GetPlace(), - ctx.template device_context(), - len_t); - } -}; - -template -class SequencePadGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* d_x = ctx.Output(framework::GradVarName("X")); - if (d_x) { - const auto* d_out = - ctx.Input(framework::GradVarName("Out")); - d_x->mutable_data(ctx.GetPlace()); - - int padded_length = ctx.Attr("padded_length"); - - phi::funcs::UnpaddingLoDTensorFunctor()( - ctx.template device_context(), - *d_out, - d_x, - padded_length, - 0, - false, - phi::funcs::kBatchLengthWidth); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc index d616bca2c4e3b..2d58d2b32276f 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc @@ -31,10 +31,10 @@ class SequencePoolOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT( in_lod_level, 0, - platform::errors::InvalidArgument("The LoD level of Input(X) should " - "be larger than 0, but received: " - "lod level %u.", - in_lod_level)); + phi::errors::InvalidArgument("The LoD level of Input(X) should " + "be larger than 0, but received: " + "lod level %u.", + in_lod_level)); ctx->SetLoDLevel("Out", in_lod_level - 1); } @@ -126,7 +126,7 @@ class SequencePoolGradOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(og_dims.size(), x_dims.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rank of output grad must equal to Input(X). But " "received: input rank %u, input shape [%s].", og_dims.size(), @@ -135,7 +135,7 @@ class SequencePoolGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( og_dims[i], x_dims[i], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension mismatch between Input(OUT@GRAD) and " "Input(X). 
Received Input(OUT@GRAD): input rank %u, " "input shape [%s]; received Input(X): input rank %u, " diff --git a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc index 6e34f76fbd37d..23ce04ca74262 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc @@ -27,18 +27,18 @@ class SequenceReshapeOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::NotFound( + phi::errors::NotFound( "Input(X) of SequenceReshapeOp should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasOutput("Out"), true, - platform::errors::NotFound( + phi::errors::NotFound( "Output(Out) of SequenceReshapeOp should not be null.")); auto x_dims = ctx->GetInputDim("X"); auto x_numel = product(x_dims); PADDLE_ENFORCE_EQ(x_dims.size(), 2U, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rank of SequenceReshapeOp Input(X) should be 2. " "But the rank we received is %d", x_dims.size())); @@ -105,12 +105,12 @@ class SequenceReshapeGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::NotFound( + phi::errors::NotFound( "Input(Out@GRAD) of SequenceReshapeGradOp should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasInput("X"), true, - platform::errors::NotFound( + phi::errors::NotFound( "Input(X) of SequenceReshapeGradOp should not be null.")); ctx->ShareDim("X", /*->*/ framework::GradVarName("X")); diff --git a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.h b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.h index de530bed0d663..e506b310ea2bb 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.h @@ -32,14 +32,14 @@ class SequenceReshapeKernel : public framework::OpKernel { int64_t in_width = in_dims[1]; auto& in_lod = in->lod(); - PADDLE_ENFORCE_EQ(in_lod.empty(), - false, - platform::errors::NotFound( - "Input(X) Tensor of SequenceReshapeOp does not " - "contain LoD information.")); + PADDLE_ENFORCE_EQ( + in_lod.empty(), + false, + phi::errors::NotFound("Input(X) Tensor of SequenceReshapeOp does not " + "contain LoD information.")); PADDLE_ENFORCE_EQ(in_lod.size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(X) Tensor of SequenceReshapeOp Only support " "one level sequence now. But lod size " "of Input(X) is %d", @@ -47,7 +47,7 @@ class SequenceReshapeKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( (uint64_t)in_dims[0], in_lod[0].back(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of SequenceReshapeOp X.shape[0] and X.lod()[0].back() " "should " "be same. But X.shape[0] = %d, X.lod()[0].back() = %d", @@ -71,7 +71,7 @@ class SequenceReshapeKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( offset * out_width, seq_len * in_width, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Please make sure (sequence_length * dimension) " "can be divided by context Attr(new_dim) with no remainder for " "each sequence. 
But the %dth sequence is invalid.", diff --git a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h index 2236988025cbc..5b2d22218adf8 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h @@ -31,17 +31,17 @@ class SequenceReverseOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of SequenceReverse must exist")); + phi::errors::NotFound("Input(X) of SequenceReverse must exist")); PADDLE_ENFORCE_EQ( ctx->HasOutput("Y"), true, - platform::errors::NotFound("Output(Y) of SequenceReverse must exist")); + phi::errors::NotFound("Output(Y) of SequenceReverse must exist")); auto x_dim = ctx->GetInputDim("X"); PADDLE_ENFORCE_GE( x_dim.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rank of SequenceReverseOp Input(X) must be greater " "than or equal to 2. But the Input(X) tensor's rank we received is " "%d", @@ -120,15 +120,15 @@ class SequenceReverseOpKernel : public framework::OpKernel { auto &x = *ctx.Input("X"); auto *y = ctx.Output("Y"); - PADDLE_ENFORCE_EQ(x.lod().empty(), - false, - platform::errors::NotFound( - "Input(X) Tensor of SequenceReverseOp does not " - "contain LoD information.")); + PADDLE_ENFORCE_EQ( + x.lod().empty(), + false, + phi::errors::NotFound("Input(X) Tensor of SequenceReverseOp does not " + "contain LoD information.")); PADDLE_ENFORCE_EQ(x.lod().size(), 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "SequenceReverseOp only support one " "level lod. But the Input(X) lod size is %d", x.lod().size())); @@ -156,7 +156,7 @@ class SequenceReverseOpKernel : public framework::OpKernel { PADDLE_ENFORCE_NE( x_data, y_data, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "SequenceReverse Op does not support in-place operation")); if (platform::is_cpu_place(ctx.GetPlace())) { diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc deleted file mode 100644 index cf7e549134cd0..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc +++ /dev/null @@ -1,204 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
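The divisibility check in the SequenceReshape kernel above ("sequence_length * dimension must be divisible by Attr(new_dim)") is really a statement about how the LoD is recomputed. A small sketch, assuming a level-1 offset vector (hypothetical `ReshapeLoD` helper, not Paddle API):

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

// When a [N, in_width] sequence tensor is reshaped to width new_dim, each
// sequence must satisfy seq_len * in_width % new_dim == 0, and its new
// offset advances by seq_len * in_width / new_dim. Illustrative only.
std::vector<size_t> ReshapeLoD(const std::vector<size_t>& lod,
                               int64_t in_width,
                               int64_t new_dim) {
  std::vector<size_t> out_lod(1, 0);
  for (size_t i = 0; i + 1 < lod.size(); ++i) {
    int64_t seq_len = static_cast<int64_t>(lod[i + 1] - lod[i]);
    assert(seq_len * in_width % new_dim == 0);  // "no remainder" check
    out_lod.push_back(out_lod.back() + seq_len * in_width / new_dim);
  }
  return out_lod;
}

int main() {
  // Sequences of lengths 2 and 4 with width 6, reshaped to width 4:
  // element counts 12 and 24 become lengths 3 and 6.
  auto out_lod = ReshapeLoD({0, 2, 6}, 6, 4);
  for (size_t v : out_lod) std::printf("%zu ", v);  // 0 3 9
  std::printf("\n");
  return 0;
}
```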
*/ - -#include "paddle/fluid/operators/sequence_ops/sequence_scatter_op.h" - -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = phi::DenseTensor; -using LoDTensor = phi::DenseTensor; - -class SequenceScatterOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor) The source input of sequence scatter op"); - AddInput("Ids", - "(LoDTensor) The index input of sequence scatter op where X" - " will be updated, must be a LoDTensor"); - AddInput("Updates", - "(LoDTensor) The values to scatter to the input tensor " - "X, must be a LoDTensor with the same LoD information as Ids"); - AddOutput("Out", - "(Tensor) The output tensor of sequence scatter op, which " - "has the same dims as X"); - AddComment(R"DOC( -Sequence Scatter Operator. - -This operator scatters the Updates tensor to the input X. It uses the LoD -information of Ids to select the rows to update, and use the values in Ids as -the columns to update in each row of X. - -Following are cases to better explain how this works: - -Example 1: -Given an all-ones Tensor input(X) - X.data = [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0], - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]] - X.dims = [3, 6] -a LoDTensor input(Ids) - Ids.data = [[0], [1], [2], [5], [4], [3], [2], [1], [3], [2], [5], [4]] - Ids.lod = [[0, 3, 8, 12]] -and a Tensor input(Updates) - Updates.data = [[0.3], [0.3], [0.4], [0.1], [0.2], [0.3], [0.4], [0.0], [0.2], [0.3], [0.1], [0.4]] - Updates.lod = [[ 0, 3, 8, 12]] -then we get an output Tensor - Out.data = [[1.3, 1.3, 1.4, 1.0, 1.0, 1.0], - [1.0, 1.0, 1.4, 1.3, 1.2, 1.1], - [1.0, 1.0, 1.3, 1.2, 1.4, 1.1]] - Out.dims = X.dims = [3, 6] -)DOC"); - } -}; - -class SequenceScatterOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - // Enforce has inputs and outputs - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SequenceScatter"); - OP_INOUT_CHECK(ctx->HasInput("Ids"), "Input", "Ids", "SequenceScatter"); - OP_INOUT_CHECK( - ctx->HasInput("Updates"), "Input", "Updates", "SequenceScatter"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SequenceScatter"); - - // Set output dim the same as input - auto ref_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", ref_dims); - - // Enforce the Updates and Ids are the same shape - auto updates_dim = ctx->GetInputDim("Updates"); - auto ids_dim = ctx->GetInputDim("Ids"); - PADDLE_ENFORCE_EQ( - updates_dim[0], - ids_dim[0], - platform::errors::InvalidArgument( - "The shape of SequenceScatter operator's input Updates and Ids do " - "not match, receive Updates's shape is [%s], Ids's shape is [%s].", - updates_dim, - ids_dim)); - - // Enforce LoD of ids and updates be the same - if (ctx->IsRuntime()) { - framework::Variable* ids_var = - PADDLE_GET(framework::Variable*, ctx->GetInputVarPtrs("Ids")[0]); - framework::Variable* updates_var = - PADDLE_GET(framework::Variable*, ctx->GetInputVarPtrs("Updates")[0]); - - auto& ids_lod = ids_var->Get().lod(); - auto& updates_lod = updates_var->Get().lod(); - PADDLE_ENFORCE_EQ( - ids_lod.size(), - 1, - platform::errors::InvalidArgument( - "The SequenceScatter operator’s Input Ids holds wrong LoD " - "information. 
Currently SequenceScatter operator can only deal " - "with one level LoD for input Ids, but received LoD level is %d.", - ids_lod.size())); - PADDLE_ENFORCE_EQ( - updates_lod.size(), - 1, - platform::errors::InvalidArgument( - "The SequenceScatter operator’s Input Updates holds wrong LoD " - "information. Currently SequenceScatter operator can only deal " - "with one level LoD for input Updates, but received LoD level is " - "%d.", - ids_lod.size())); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - platform::CPUPlace()); - } -}; - -class SequenceScatterGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - ctx->SetOutputDim(framework::GradVarName("Updates"), - ctx->GetInputDim("Updates")); - ctx->SetOutputDim(framework::GradVarName("X"), - ctx->GetInputDim(framework::GradVarName("Out"))); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - platform::CPUPlace()); - } -}; - -template -class SequenceScatterGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("sequence_scatter_grad"); - op->SetInput("Ids", this->Input("Ids")); - op->SetInput("Updates", this->Input("Updates")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetOutput(framework::GradVarName("Updates"), - this->InputGrad("Updates")); - op->SetAttrMap(this->Attrs()); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(SequenceScatterGradNoNeedBufferVarsInferer, - "Updates"); - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(sequence_scatter, - ops::SequenceScatterOp, - ops::SequenceScatterOpMaker, - ops::SequenceScatterGradMaker, - ops::SequenceScatterGradMaker); -REGISTER_OPERATOR(sequence_scatter_grad, - ops::SequenceScatterGradOp, - ops::SequenceScatterGradNoNeedBufferVarsInferer); -PD_REGISTER_STRUCT_KERNEL(sequence_scatter, - CPU, - ALL_LAYOUT, - ops::SequenceScatterOpKernel, - float, - double, - int, - int64_t) {} -PD_REGISTER_STRUCT_KERNEL(sequence_scatter_grad, - CPU, - ALL_LAYOUT, - ops::SequenceScatterGradientOpKernel, - float, - double, - int, - int64_t) {} diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h deleted file mode 100644 index 389b630015e6f..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h +++ /dev/null @@ -1,155 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
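The SequenceScatter DOC example above fully determines the update rule: the LoD of Ids splits Ids/Updates into one segment per row of X, and within segment seg, X[seg][Ids[i]] += Updates[i]. A standalone sketch that reproduces that example (plain C++, hypothetical `SequenceScatter` name, not the Paddle kernel):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Illustrative re-implementation of the scatter rule described above.
void SequenceScatter(std::vector<std::vector<float>>* x,
                     const std::vector<int64_t>& ids,
                     const std::vector<float>& updates,
                     const std::vector<size_t>& ids_lod) {  // e.g. {0,3,8,12}
  for (size_t seg = 0; seg + 1 < ids_lod.size(); ++seg)
    for (size_t i = ids_lod[seg]; i < ids_lod[seg + 1]; ++i)
      (*x)[seg][ids[i]] += updates[i];
}

int main() {
  // Mirrors the DOC example above: an all-ones [3, 6] X.
  std::vector<std::vector<float>> x(3, std::vector<float>(6, 1.0f));
  std::vector<int64_t> ids = {0, 1, 2, 5, 4, 3, 2, 1, 3, 2, 5, 4};
  std::vector<float> updates = {0.3f, 0.3f, 0.4f, 0.1f, 0.2f, 0.3f,
                                0.4f, 0.0f, 0.2f, 0.3f, 0.1f, 0.4f};
  SequenceScatter(&x, ids, updates, {0, 3, 8, 12});
  for (const auto& row : x) {
    for (float v : row) std::printf("%.1f ", v);
    std::printf("\n");  // matches Out.data in the DOC example
  }
  return 0;
}
```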
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/scatter.h" - -namespace paddle { -namespace operators { - -using Tensor = phi::DenseTensor; -using LoDTensor = phi::DenseTensor; - -template -class SequenceScatterOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* ids = ctx.Input("Ids"); - auto* updates = ctx.Input("Updates"); - auto* out = ctx.Output("Out"); - - auto& ids_lod = ids->lod(); - PADDLE_ENFORCE_EQ(ids_lod.empty(), - false, - platform::errors::InvalidArgument( - "Input(Ids) Tensor of SequenceScatter operator does " - "not contain LoD information.")); - - // Initialize out as same as x - out->mutable_data(ctx.GetPlace()); - framework::TensorCopySync(*x, ctx.GetPlace(), out); - - auto x_dims = x->dims(); - auto out_dims = out->dims(); - - for (int i = 0; i < x_dims.size(); ++i) - PADDLE_ENFORCE_EQ(x_dims[i], - out_dims[i], - platform::errors::InvalidArgument( - "Input(X) and output(Out) shape of SequenceScatter " - "operator do not match. Received input(X)'s shape " - "is [%s], output(Out)'s shape is [%s].", - x_dims, - out_dims)); - - size_t slice_size = 1; - for (int i = 1; i < x_dims.size(); ++i) slice_size *= x_dims[i]; - - auto lod_vec = ids_lod[0]; - unsigned int seg = 0; - for (int i = 0; i < ids->dims()[0]; ++i) { - PADDLE_ENFORCE_LT( - seg, - lod_vec.size() - 1, - platform::errors::OutOfRange("The segment index is out of bound in " - "SequenceScatter operator, it must be " - "less than batch size. The segment " - "index is %d, the batch size is %d.", - seg, - lod_vec.size())); - int lower_bound = lod_vec[seg]; - int upper_bound = lod_vec[seg + 1]; - if (i >= lower_bound && i < upper_bound) { - T* p_out = out->data(); - const T* p_updates = updates->data(); - const int64_t* p_index = ids->data(); - p_out[seg * slice_size + p_index[i]] += p_updates[i]; - } else { - ++seg; - --i; - } - } - } -}; - -template -class SequenceScatterGradientOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), - true, - platform::errors::Unimplemented("Device dose not match. The " - "SequenceScatterGradientOpKernel can " - "only run on CPU device.")); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dUpdates = ctx.Output(framework::GradVarName("Updates")); - auto* ids = ctx.Input("Ids"); - auto* dOut = ctx.Input(framework::GradVarName("Out")); - - auto& ids_lod = ids->lod(); - - dX->mutable_data(ctx.GetPlace()); - framework::TensorCopySync(*dOut, ctx.GetPlace(), dX); - dUpdates->mutable_data(ctx.GetPlace()); - - auto dx_dims = dX->dims(); - auto dout_dims = dOut->dims(); - - for (int i = 0; i < dx_dims.size(); ++i) - PADDLE_ENFORCE_EQ(dx_dims[i], - dout_dims[i], - platform::errors::InvalidArgument( - "Input(Out@GRAD) and output(X@GRAD) shape of " - "SequenceScatterGradient operator do not match. 
" - "Received input(Out@GRAD)'s shape is [%s], " - "output(X@GRAD)'s shape is [%s].", - dout_dims, - dx_dims)); - - size_t slice_size = 1; - for (int i = 1; i < dx_dims.size(); ++i) slice_size *= dx_dims[i]; - - auto lod_vec = ids_lod[0]; - unsigned int seg = 0; - - for (int i = 0; i < ids->dims()[0]; ++i) { - PADDLE_ENFORCE_LT( - seg, - lod_vec.size() - 1, - platform::errors::OutOfRange( - "The segment index is out of bound in SequenceScatterGradient " - "operator, it must be less than batch size. The segment index is " - "%d, the batch size is %d.", - seg, - lod_vec.size())); - int lower_bound = lod_vec[seg]; - int upper_bound = lod_vec[seg + 1]; - if (i >= lower_bound && i < upper_bound) { - const T* p_dOut = dOut->data(); - const int64_t* p_index = ids->data(); - T* p_dUpdates = dUpdates->data(); - p_dUpdates[i] = p_dOut[seg * slice_size + p_index[i]]; - } else { - ++seg; - --i; - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc deleted file mode 100644 index ed6e53b9ca7e8..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc +++ /dev/null @@ -1,176 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/sequence_ops/sequence_slice_op.h" - -#include - -namespace paddle { -namespace operators { - -class SequenceSliceOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SequenceSlice"); - OP_INOUT_CHECK(ctx->HasInput("Offset"), "Input", "Offset", "SequenceSlice"); - OP_INOUT_CHECK(ctx->HasInput("Length"), "Input", "Length", "SequenceSlice"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SequenceSlice"); - auto input_dims = ctx->GetInputDim("X"); - - auto offset_dim = ctx->GetInputDim("Offset"); - auto length_dim = ctx->GetInputDim("Length"); - - PADDLE_ENFORCE_EQ( - offset_dim.size(), - 2UL, - platform::errors::InvalidArgument( - "Input Offset dimension error. SequenceSlice operator only support " - "one level sequence now, the dimension of input Offset must be 2, " - "but received dimension is %d.", - offset_dim.size())); - PADDLE_ENFORCE_EQ( - length_dim.size(), - 2UL, - platform::errors::InvalidArgument( - "Input Length dimension error. 
SequenceSlice operator only support " - "one level sequence now, the dimension of input Length must be 2, " - "but received dimension is %d.", - offset_dim.size())); - - // Initialize the output's dims to maximum, - // and re-set to real dims by the value of Offset and Length at kernel - ctx->SetOutputDim("Out", input_dims); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class SequenceSliceGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "SequenceSliceGrad"); - OP_INOUT_CHECK(ctx->HasOutputs(framework::GradVarName("X")), - "Output", - framework::GradVarName("X"), - "SequenceSliceGrad"); - ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.GetPlace()); - } -}; - -class SequenceSliceOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(LoDTensor), " - "the input of SequenceSliceOp."); - AddInput("Offset", - "(Tensor), " - "a vector to describe the offset of every input sequence for " - "sub sequence item."); - AddInput("Length", - "(Tensor), " - "a vector to describe the length of every input sequence for " - "sub sequence item."); - AddOutput("Out", "(LoDTensor), the output of SequenceSliceOp."); - AddComment(R"DOC( -Sequence slice operator - -The operator crops a subsequence from given sequence with given start offset and subsequence length. -It only supports sequence (LoD Tensor with level number is 1). -- Case: - X = [[a1, a2; - b1, b2; - c1, c2] - [d1, d2; - e1, e2]] - LoD(X) = {{0, 3, 5}}; Dims(X) = (5, 2) - Offset = [[0], [1]]; Length = [[2], [1]] - - Out = [[a1, a2; - b1, b2] - [e1, e2]] - LoD(Out) = {{0, 2, 3}}; Dims(Out) = (3, 2) -NOTE: The first dimension size of input, the size of offset and Length, should be equal. The offset start from 0. 
- )DOC"); - } -}; - -template -class SequenceSliceGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("sequence_slice_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("Offset", this->Input("Offset")); - op->SetInput("Length", this->Input("Length")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetAttrMap(this->Attrs()); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(SequenceSliceGradNoNeedBufferVarsInferer, - "X"); - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(sequence_slice, - ops::SequenceSliceOp, - ops::SequenceSliceOpMaker, - ops::SequenceSliceGradOpMaker, - ops::SequenceSliceGradOpMaker); -REGISTER_OPERATOR(sequence_slice_grad, - ops::SequenceSliceGradOp, - ops::SequenceSliceGradNoNeedBufferVarsInferer); -PD_REGISTER_STRUCT_KERNEL(sequence_slice, - CPU, - ALL_LAYOUT, - ops::SequenceSliceOpKernel, - float, - double, - int, - int64_t) {} -PD_REGISTER_STRUCT_KERNEL(sequence_slice_grad, - CPU, - ALL_LAYOUT, - ops::SequenceSliceGradOpKernel, - float, - double, - int, - int64_t) {} diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cu b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cu deleted file mode 100644 index 407eb2e3ad7db..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cu +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/sequence_ops/sequence_slice_op.h" - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL(sequence_slice, - GPU, - ALL_LAYOUT, - ops::SequenceSliceOpKernel, - float, - double, - int, - int64_t) {} -PD_REGISTER_STRUCT_KERNEL(sequence_slice_grad, - GPU, - ALL_LAYOUT, - ops::SequenceSliceGradOpKernel, - float, - double, - int, - int64_t) {} diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h deleted file mode 100644 index 50a3e97633475..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h +++ /dev/null @@ -1,216 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/strided_memcpy.h" - -namespace paddle { -namespace operators { - -using Tensor = phi::DenseTensor; -using LoDTensor = phi::DenseTensor; -using LoD = framework::LoD; - -template -inline LoD SequenceSliceLoD(const T& in, - const int64_t* offset_data, - const int64_t* length_data) { - auto out_lod = in.lod(); - size_t lod_offset = 0; - - auto n = in.lod()[0].size() - 1; - out_lod[0][0] = 0; - for (size_t i = 0; i < n; ++i) { - lod_offset += length_data[i]; - out_lod[0][i + 1] = lod_offset; - } - return out_lod; -} - -template -class SequenceSliceOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* offset = ctx.Input("Offset"); - auto* length = ctx.Input("Length"); - auto* out = ctx.Output("Out"); - - auto lod = in->lod(); - PADDLE_ENFORCE_EQ(lod.empty(), - false, - platform::errors::InvalidArgument( - "Input(X) Tensor of SequenceSlice operator does not " - "contain LoD information.")); - - PADDLE_ENFORCE_EQ( - lod.size(), - 1UL, - platform::errors::InvalidArgument( - "LoD information error. SequenceSlice operator only support one " - "level sequence now, but received LoD level is %d.", - lod.size())); - auto n = lod[0].size() - 1; - PADDLE_ENFORCE_EQ( - n, - static_cast(length->dims()[0]), - platform::errors::InvalidArgument( - "Input length shape error. The length of input LoD sequence and " - "input length-array‘s first dimension should be equal, but the LoD " - "sequence length is %d, the length-array‘s first dimension is %d.", - n, - static_cast(length->dims()[0]))); - PADDLE_ENFORCE_EQ( - n, - static_cast(offset->dims()[0]), - platform::errors::InvalidArgument( - "Input offset shape error. The length of input LoD sequence and " - "input offset-array‘s first dimension should be equal, but the LoD " - "sequence length is %d, the offset-array‘s first dimension is %d.", - n, - static_cast(offset->dims()[0]))); - - const int64_t* offset_data = offset->data(); - const int64_t* length_data = length->data(); - phi::DenseTensor offset_cpu; - phi::DenseTensor length_cpu; - - if (platform::is_gpu_place(ctx.GetPlace())) { - offset_cpu.mutable_data(offset->dims(), platform::CPUPlace()); - framework::TensorCopySync(*offset, platform::CPUPlace(), &offset_cpu); - offset_data = offset_cpu.data(); - - length_cpu.mutable_data(length->dims(), platform::CPUPlace()); - framework::TensorCopySync(*length, platform::CPUPlace(), &length_cpu); - length_data = length_cpu.data(); - } - - for (size_t i = 0; i < n; ++i) { - PADDLE_ENFORCE_LE(0, - offset_data[i], - platform::errors::InvalidArgument( - "The input offset[%d]'s value is negative, its " - "value is %d, expect it to be non-negative.", - i, - offset_data[i])); - PADDLE_ENFORCE_LE(0, - length_data[i], - platform::errors::InvalidArgument( - "The input length[%d]'s value is negative, its " - "value is %d, expect it to be non-negative.", - i, - offset_data[i])); - PADDLE_ENFORCE_LE( - lod[0][i] + offset_data[i] + length_data[i], - lod[0][i + 1], - platform::errors::OutOfRange( - "The slice end index of target tensor is out of range. 
expect it " - "less than or equal to %d, but the actual slice end index is %d.", - lod[0][i + 1], - lod[0][i] + offset_data[i] + length_data[i])); - } - - out->mutable_data(ctx.GetPlace()); - auto out_lod = SequenceSliceLoD(*in, offset_data, length_data); - auto out_dims = in->dims(); - out_dims[0] = out_lod[0][out_lod[0].size() - 1]; - out->Resize(out_dims); - out->set_lod(out_lod); - - auto in_stride = common::stride(in->dims()); - auto out_stride = common::stride(out->dims()); - - size_t out_offset = 0; - for (size_t i = 0; i < n; ++i) { - if (length_data[i] == 0) continue; - Tensor in_t = in->Slice( - static_cast(lod[0][i] + offset_data[i]), - static_cast(lod[0][i] + offset_data[i] + length_data[i])); - - phi::funcs::StridedMemcpy(ctx.device_context(), - in_t.data(), - in_stride, - in_t.dims(), - out_stride, - out->data() + out_offset); - out_offset += length_data[i] * in_stride[0]; - } - } -}; - -template -class SequenceSliceGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* offset = ctx.Input("Offset"); - auto* length = ctx.Input("Length"); - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* x_grad = ctx.Output(framework::GradVarName("X")); - - const int64_t* offset_data = offset->data(); - const int64_t* length_data = length->data(); - phi::DenseTensor offset_cpu; - phi::DenseTensor length_cpu; - - if (platform::is_gpu_place(ctx.GetPlace())) { - offset_cpu.mutable_data(offset->dims(), platform::CPUPlace()); - framework::TensorCopySync(*offset, platform::CPUPlace(), &offset_cpu); - offset_data = offset_cpu.data(); - - length_cpu.mutable_data(length->dims(), platform::CPUPlace()); - framework::TensorCopySync(*length, platform::CPUPlace(), &length_cpu); - length_data = length_cpu.data(); - } - - auto lod = in->lod(); - // to avoid out_grad missing lod, compute lod again - auto out_lod = SequenceSliceLoD(*in, offset_data, length_data); - - if (x_grad) { - x_grad->mutable_data(ctx.GetPlace()); - x_grad->set_lod(in->lod()); - phi::funcs::SetConstant set_zero; - set_zero(ctx.template device_context(), - x_grad, - static_cast(0)); - - for (size_t i = 0; i < out_lod[0].size() - 1; ++i) { - if (length_data[i] == 0) continue; - Tensor out_grad_t = - out_grad->Slice(static_cast(out_lod[0][i]), - static_cast(out_lod[0][i + 1])); - auto out_grad_stride = common::stride(out_grad_t.dims()); - - auto x_grad_stride = common::stride(x_grad->dims()); - - Tensor x_grad_t = x_grad->Slice( - static_cast(lod[0][i] + offset_data[i]), - static_cast(lod[0][i] + offset_data[i] + length_data[i])); - - phi::funcs::StridedMemcpy(ctx.device_context(), - out_grad_t.data(), - out_grad_stride, - out_grad_t.dims(), - x_grad_stride, - x_grad_t.data()); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc index 01f7bb3e92890..0a4d5a69a8e2b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc @@ -36,7 +36,7 @@ class SequenceSoftmaxCUDNNKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( dims[0], static_cast(lod[level].back()), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dimension of Input(X) should be equal to the sum of all " "sequences' lengths. 
But received first dimension of Input(X) is " "%d, the sum of all sequences' lengths is %d.", @@ -44,7 +44,7 @@ class SequenceSoftmaxCUDNNKernel : public framework::OpKernel { static_cast(lod[level].back()))); PADDLE_ENFORCE_EQ(dims[0], x->numel(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The width of each timestep in Input(X) of " "SequenceSoftmaxOp should be 1.")); diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc index 12d4f72a91169..5fbbd49a88521 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc @@ -111,7 +111,7 @@ class SequenceSoftmaxGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( out_dim, out_grad_dim, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of Input(Out) and Input(Out@GRAD) of " "SequenceSoftmaxGrad operator do not match. The Input(Out)'s shape " "is [%s], the Input(Out@GRAD)'s shape is [%s].", diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h index 6c6f1b69c8196..ee372e6a9d382 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h @@ -97,7 +97,7 @@ class SequenceSoftmaxKernel : public framework::OpKernel { auto dims = x->dims(); PADDLE_ENFORCE_EQ(lod.empty(), false, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(X) phi::DenseTensor of SequenceSoftmax " "operator does not contain " "LoD information.")); @@ -106,7 +106,7 @@ class SequenceSoftmaxKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( dims[0], static_cast(lod[level].back()), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The first dimension of Input(X) should be equal to the sum of all " "sequences' lengths. But the first dimension of Input(X) is %d, " "the sum of all sequences' lengths is %d.", @@ -115,7 +115,7 @@ class SequenceSoftmaxKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( dims[0], x->numel(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The width of each timestep in Input(X) of SequenceSoftmax " "operator should be 1. But the first dimension of Input(X) is %d, " "the number of elements is %d.", diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc deleted file mode 100644 index 4b19faea335bf..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc +++ /dev/null @@ -1,213 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
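The two checks in the SequenceSoftmax kernels above (dims[0] equals the sum of sequence lengths, and each timestep has width 1) imply the op is a softmax taken independently within each LoD segment of a flat [N, 1] tensor. A minimal sketch under that reading, plain C++ and not the cuDNN path:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// In-place softmax within each LoD segment of a flat vector. Illustrative
// only; `SequenceSoftmax` is a hypothetical name.
void SequenceSoftmax(std::vector<float>* x, const std::vector<size_t>& lod) {
  for (size_t i = 0; i + 1 < lod.size(); ++i) {
    float max_v = (*x)[lod[i]];
    for (size_t j = lod[i]; j < lod[i + 1]; ++j)
      max_v = std::max(max_v, (*x)[j]);
    float sum = 0.0f;
    for (size_t j = lod[i]; j < lod[i + 1]; ++j) {
      (*x)[j] = std::exp((*x)[j] - max_v);  // shift for numerical stability
      sum += (*x)[j];
    }
    for (size_t j = lod[i]; j < lod[i + 1]; ++j) (*x)[j] /= sum;
  }
}

int main() {
  std::vector<float> x = {1.0f, 2.0f, 3.0f, 1.0f, 1.0f};
  SequenceSoftmax(&x, {0, 3, 5});  // softmax over rows [0,3) and [3,5)
  for (float v : x) std::printf("%.3f ", v);
  std::printf("\n");  // first three sum to 1; last two are 0.5 each
  return 0;
}
```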
*/ - -#include "paddle/fluid/operators/sequence_ops/sequence_unpad_op.h" - -#include -#include - -namespace paddle { -namespace operators { - -class SequenceUnpadOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), - true, - platform::errors::NotFound( - "Input(X) of SequenceUnpadOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Length"), - true, - platform::errors::NotFound( - "Input(Length) of SequenceUnpadOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), - true, - platform::errors::NotFound( - "Output(Out) of SequenceUnpadOp should not be null.")); - - auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_GE(x_dims.size(), - 2, - platform::errors::InvalidArgument( - "The rank of Input(X) can't be less than 2. But the " - "rank we received is %d", - x_dims.size())); - - auto len_dims = ctx->GetInputDim("Length"); - PADDLE_ENFORCE_EQ(len_dims.size(), - 1, - platform::errors::InvalidArgument( - "The rank of SequenceUnpadOp Input(Length) should " - "be 1. But the rank we received is %d", - len_dims.size())); - PADDLE_ENFORCE_EQ( - len_dims[0], - x_dims[0], - platform::errors::InvalidArgument( - "The 1st dimension of SequenceUnpadOp Input(X) and Input(Length)" - "should be same. But the 1st dimension of " - "Input(X) is %d, Input(Length) is %d", - x_dims[0], - len_dims[0])); - - int64_t out_dim_0 = -1; - if (ctx->IsRuntime()) { - out_dim_0 = x_dims[0] * x_dims[1]; - } - - std::vector out_dims_vec{out_dim_0}; - if (x_dims.size() == 2) { - out_dims_vec.push_back(1); - } else { - for (int i = 2; i < x_dims.size(); ++i) { - out_dims_vec.push_back(x_dims[i]); - } - } - ctx->SetOutputDim("Out", common::make_ddim(out_dims_vec)); - if (!ctx->IsRuntime()) { - ctx->SetLoDLevel("Out", 1); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return phi::KernelKey(data_type, ctx.GetPlace()); - } -}; - -class SequenceUnpadOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(LoDTensor, default LoDTensor) Input tensor which " - "contains the padded sequences with equal length."); - AddInput( - "Length", - "(LoDTensor) The input tensor which specifies the actual length of " - "sequences after unpadding."); - AddOutput( - "Out", - "(LoDTensor) The output tensor which contains unpadded sequences."); - AddComment(R"DOC( - Sequence Unpad Operator - - This operator removes the padding data in the input sequences and convert - them into sequences with actual length as output, identified by lod - information. 
- - Example: - - Given input tensor Input(X): - X.data = [[ 1.0, 2.0, 3.0, 4.0, 5.0], - [ 6.0, 7.0, 8.0, 9.0, 10.0], - [11.0, 12.0, 13.0, 14.0, 15.0]], -` - in which there are 3 sequences padded to length 5, and the actual length - specified by Input(Length): - - Length.data = [2, 3, 4], - - after unpadding, Output(Out) will be: - - Out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]] - Out.lod = [[0, 2, 5, 9]] - - )DOC"); - } -}; - -class SequenceUnpadGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), - true, - platform::errors::NotFound( - "Input(X) of SequenceUnpadGradOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput(framework::GradVarName("Out")), - true, - platform::errors::NotFound( - "Input(Out@GRAD) of SequenceUnpadGradOp should not be null.")); - - if (ctx->HasOutput(framework::GradVarName("X"))) { - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); - return phi::KernelKey(data_type, ctx.GetPlace()); - } -}; - -template -class SequenceUnpadGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("sequence_unpad_grad"); - op->SetAttrMap(this->Attrs()); - op->SetInput("X", this->Input("X")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(SequenceUnpadGradOpNoNeedBufferVarsInferer, - "X"); - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(sequence_unpad, - ops::SequenceUnpadOp, - ops::SequenceUnpadOpMaker, - ops::SequenceUnpadGradOpMaker, - ops::SequenceUnpadGradOpMaker); -REGISTER_OPERATOR(sequence_unpad_grad, - ops::SequenceUnpadGradOp, - ops::SequenceUnpadGradOpNoNeedBufferVarsInferer); -PD_REGISTER_STRUCT_KERNEL(sequence_unpad, - CPU, - ALL_LAYOUT, - ops::SequenceUnpadOpKernel, - float, - double, - int, - int64_t) {} -PD_REGISTER_STRUCT_KERNEL(sequence_unpad_grad, - CPU, - ALL_LAYOUT, - ops::SequenceUnpadGradOpKernel, - float, - double, - int, - int64_t) {} diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cu b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cu deleted file mode 100644 index 8ba8b380c0976..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cu +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
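The SequenceUnpad example above (X padded to width 5, Length [2, 3, 4]) determines both the output data and its LoD. A plain-C++ sketch of that mapping — the deleted kernel does the copy via UnpaddingLoDTensorFunctor, so this is only an illustration with a hypothetical `SequenceUnpad` name:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Keep the first length[i] entries of each padded row and rebuild the LoD
// as a running sum of the lengths. Illustrative only.
void SequenceUnpad(const std::vector<std::vector<float>>& x,
                   const std::vector<int64_t>& length,
                   std::vector<float>* out,
                   std::vector<size_t>* out_lod) {
  out_lod->assign(1, 0);
  for (size_t i = 0; i < x.size(); ++i) {
    for (int64_t j = 0; j < length[i]; ++j) out->push_back(x[i][j]);
    out_lod->push_back(out_lod->back() + length[i]);
  }
}

int main() {
  // Mirrors the DOC example above.
  std::vector<std::vector<float>> x = {{1, 2, 3, 4, 5},
                                       {6, 7, 8, 9, 10},
                                       {11, 12, 13, 14, 15}};
  std::vector<float> out;
  std::vector<size_t> out_lod;
  SequenceUnpad(x, {2, 3, 4}, &out, &out_lod);
  for (float v : out) std::printf("%g ", v);  // 1 2 6 7 8 11 12 13 14
  std::printf("\n");
  for (size_t v : out_lod) std::printf("%zu ", v);  // 0 2 5 9
  std::printf("\n");
  return 0;
}
```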
*/ - -#include "paddle/fluid/operators/sequence_ops/sequence_unpad_op.h" - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL(sequence_unpad, - GPU, - ALL_LAYOUT, - ops::SequenceUnpadOpKernel, - float, - double, - int, - int64_t) {} -PD_REGISTER_STRUCT_KERNEL(sequence_unpad_grad, - GPU, - ALL_LAYOUT, - ops::SequenceUnpadGradOpKernel, - float, - double, - int, - int64_t) {} diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h deleted file mode 100644 index cc38fd510ef1e..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/sequence_padding.h" - -namespace paddle { -namespace operators { - -using LoDTensor = phi::DenseTensor; -using LoD = framework::LoD; - -template -class SequenceUnpadOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x_t = ctx.Input("X"); - auto* len_t = ctx.Input("Length"); - auto* out_t = ctx.Output("Out"); - - auto& dev_ctx = ctx.template device_context(); - phi::DenseTensor seq_len_cpu = - ctx.AllocateTmpTensor(len_t->dims(), dev_ctx); - if (platform::is_gpu_place(ctx.GetPlace()) || - platform::is_xpu_place(ctx.GetPlace())) { - seq_len_cpu.mutable_data(platform::CPUPlace()); - framework::TensorCopySync(*len_t, platform::CPUPlace(), &seq_len_cpu); - } else { - seq_len_cpu = *len_t; - } - - const int64_t* seq_len_ptr = seq_len_cpu.data(); - int64_t batch_size = len_t->dims()[0]; - std::vector out_lod0(batch_size + 1, 0); - for (int64_t i = 0; i < batch_size; ++i) { - out_lod0[i + 1] = out_lod0[i] + static_cast(seq_len_ptr[i]); - } - - framework::LoD out_lod; - out_lod.push_back(out_lod0); - out_t->set_lod(out_lod); - std::vector out_dims_vec{static_cast(out_lod0.back())}; - if (x_t->dims().size() == 2) { - out_dims_vec.push_back(1); - } else { - for (int i = 2; i < x_t->dims().size(); ++i) { - out_dims_vec.push_back(x_t->dims()[i]); - } - } - out_t->Resize(common::make_ddim(out_dims_vec)); - - // after set the lod of output, allocate the memory - out_t->mutable_data(ctx.GetPlace()); - - int64_t padded_length = x_t->dims()[1]; - phi::funcs::UnpaddingLoDTensorFunctor()( - dev_ctx, - *x_t, - out_t, - padded_length, - 0, - false, - phi::funcs::kBatchLengthWidth); - } -}; - -template -class SequenceUnpadGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* d_x = ctx.Output(framework::GradVarName("X")); - if (d_x) { - const auto* d_out = ctx.Input(framework::GradVarName("Out")); - d_x->mutable_data(ctx.GetPlace()); - - int padded_length = d_x->dims()[1]; - - LoDTensor zero_pads; - 
zero_pads.Resize({1, 1}); - zero_pads.mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = ctx.template device_context(); - set_zero(dev_ctx, &zero_pads, static_cast(0)); - - phi::funcs::PaddingLoDTensorFunctor()( - ctx.template device_context(), - *d_out, - d_x, - zero_pads, - padded_length, - 0, - false, - phi::funcs::kBatchLengthWidth); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op_xpu.cc b/paddle/fluid/operators/sequence_ops/sequence_unpad_op_xpu.cc deleted file mode 100644 index c875cdc37e80b..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op_xpu.cc +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_XPU - -#include "paddle/fluid/operators/sequence_ops/sequence_unpad_op.h" - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL( - sequence_unpad, XPU, ALL_LAYOUT, ops::SequenceUnpadOpKernel, float) {} - -#endif diff --git a/paddle/fluid/operators/sequence_ops/unity_build_rule.cmake b/paddle/fluid/operators/sequence_ops/unity_build_rule.cmake index a22e6865cf103..460c845029a96 100644 --- a/paddle/fluid/operators/sequence_ops/unity_build_rule.cmake +++ b/paddle/fluid/operators/sequence_ops/unity_build_rule.cmake @@ -6,7 +6,6 @@ # in combination rule, you can remove the source file from the following rules. 
register_unity_group( cc - sequence_concat_op.cc sequence_conv_op.cc sequence_enumerate_op.cc sequence_erase_op.cc diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 5eeb356817a2a..268f7457f2136 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -189,7 +189,7 @@ class SetValueGrad : public framework::OperatorWithKernel { PADDLE_ENFORCE_LT( in_dims.size(), 7, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of set_value_grad operator's input should be less " "than 7, but received dimension is %d.", in_dims.size())); diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index 7d5fd042bb0fb..85022ead0e905 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -22,10 +22,10 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/assign_value_op.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/funcs/slice_utils.h" namespace paddle { @@ -68,7 +68,7 @@ inline void CheckIsDimsMatch(const framework::DDim first, return; } } - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "The shape of tensor assigned value must match the shape " "of target shape: %d, but now shape is %d.", second.to_str(), diff --git a/paddle/fluid/operators/share_data_op.cc b/paddle/fluid/operators/share_data_op.cc index b780ccba920c0..4accee24e17fa 100644 --- a/paddle/fluid/operators/share_data_op.cc +++ b/paddle/fluid/operators/share_data_op.cc @@ -33,12 +33,12 @@ class ShareDataOp : public framework::OperatorWithKernel { in_type == framework::proto::VarType::LOD_TENSOR || in_type == framework::proto::VarType::SELECTED_ROWS, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Type of Variable[X] must be phi::DenseTensor or SelectedRows!")); PADDLE_ENFORCE_EQ( in_type, out_type, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The type of input (X) and output (Out) are inconsistent.")); ctx->ShareDim("X", "Out"); @@ -80,4 +80,4 @@ PD_REGISTER_STRUCT_KERNEL(share_data, int64_t, float, double, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/share_data_op.cu b/paddle/fluid/operators/share_data_op.cu index 7e67b491834ea..2b1c32d655b80 100644 --- a/paddle/fluid/operators/share_data_op.cu +++ b/paddle/fluid/operators/share_data_op.cu @@ -27,4 +27,4 @@ PD_REGISTER_STRUCT_KERNEL(share_data, int64_t, float, double, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc deleted file mode 100644 index e883ba8e83092..0000000000000 --- a/paddle/fluid/operators/shrink_rnn_memory_op.cc +++ /dev/null @@ -1,225 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/array_operator.h" -#include "paddle/phi/core/lod_utils.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace framework { -class OpDesc; -class Scope; -} // namespace framework -namespace imperative { -class OpBase; -} // namespace imperative -} // namespace paddle - -namespace paddle { -namespace operators { - -class ShrinkRNNMemoryOp : public ArrayOp { - public: - ShrinkRNNMemoryOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : ArrayOp(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto *x_var = scope.FindVar(Input("X")); - PADDLE_ENFORCE_NOT_NULL(x_var, - platform::errors::NotFound( - "Input(X) of ShrinkRNNMemoryOp is not found.")); - auto &x_tensor = x_var->Get(); - size_t offset = this->GetOffset(scope, place); - auto *rank_table_var = scope.FindVar(Input("RankTable")); - PADDLE_ENFORCE_NOT_NULL( - rank_table_var, - platform::errors::NotFound( - "Input(RankTable) of ShrinkRNNMemoryOp is not found.")); - auto &rank_table = rank_table_var->Get(); - - auto &rank_items = rank_table.items(); - int dst_num_rows = static_cast( - std::lower_bound(rank_items.begin(), - rank_items.end(), - offset, - [](const framework::LoDRankTable::TableItem &a, - size_t b) { return a.length > b; }) - - rank_items.begin()); - - auto *out_var = scope.FindVar(Output("Out")); - PADDLE_ENFORCE_NOT_NULL( - out_var, - platform::errors::NotFound( - "Output(Out) of ShrinkRNNMemoryOp is not found.")); - auto &out_tensor = *out_var->GetMutable(); - - size_t height = dst_num_rows; - - // do shrink for the top level LoD - if (!x_tensor.lod().empty() && - x_tensor.lod()[0].size() > static_cast(dst_num_rows)) { - auto lod_offset = framework::GetSubLoDAndAbsoluteOffset( - x_tensor.lod(), 0, dst_num_rows, 0); - height = lod_offset.second.second; - auto out_lod = out_tensor.mutable_lod(); - phi::AppendLoD(out_lod, lod_offset.first); - } - - if (dst_num_rows != 0) { - out_tensor.mutable_data(place, x_tensor.dtype()); - auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - framework::TensorCopy(x_tensor.Slice(0, static_cast(height)), - place, - *dev_ctx, - &out_tensor); - } - } -}; - -class ShrinkRNNMemoryOpProtoMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(phi::DenseTensor) The RNN step memory to be shrank."); - AddInput("RankTable", "(LoDRankTable) The lod_rank_table of dynamic RNN."); - AddInput( - "I", - "(phi::DenseTensor) The step index. The RNN step memory 'X' will be " - "shrank to match the size of the input of the index'th step."); - AddOutput("Out", "(phi::DenseTensor) The shrank RNN step memory."); - AddComment(R"DOC( -This operator is used to shrink output batch of memory defined in dynamic RNN. - -Dynamic RNN is able to handle variable-length sequences, in which, sequences in -a mini-batch are sorted by their lengths first. 
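The RunImpl above turns the rank table into dst_num_rows with std::lower_bound: because sequences are sorted longest-first, the number still active at timestep `offset` is the count of lengths greater than `offset`. A standalone sketch with hypothetical lengths:

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// Sequences are sorted by length, longest first. At step `offset`, the
// number of still-active sequences is the count of lengths > offset,
// which lower_bound finds directly on the descending array.
int main() {
  const std::vector<size_t> lengths = {5, 3, 3, 1};  // sorted descending
  const size_t offset = 2;                           // current timestep
  auto it = std::lower_bound(
      lengths.begin(), lengths.end(), offset,
      [](size_t len, size_t off) { return len > off; });
  const int dst_num_rows = static_cast<int>(it - lengths.begin());
  assert(dst_num_rows == 3);  // sequences of length 5, 3, 3 are still alive
  return 0;
}
```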
After that, the longest sequence -becomes the first one in the sorted batch, followed by the second longest, the -third longest, and so on. Dynamic RNN then slices a batch input timestep by -timestep from the sorted input. Once any sequence in the input batch reaches its -end, memory defined in dynamicRNN has to shrink its outputs to adapt to the input -batch size for the next time step. -)DOC"); - } -}; - -class ShrinkRNNMemoryInferShape : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *context) const override { - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "ShrinkRNNMemory"); - OP_INOUT_CHECK(context->HasInput("I"), "Input", "I", "ShrinkRNNMemory"); - OP_INOUT_CHECK(context->HasInput("RankTable"), - "Input", - "RankTable", - "ShrinkRNNMemory"); - context->SetOutputDim("Out", context->GetInputDim("X")); - // For runtime, output's lod is computed according to input's lod, but - // remove the finished sequence. It is set in detail kernel implementation. - if (!context->IsRuntime()) { - context->ShareLoD("X", /*->*/ "Out"); - } - } -}; - -class ShrinkRNNMemoryGradOp : public ArrayOp { - public: - ShrinkRNNMemoryGradOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : ArrayOp(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto *dout_var = scope.FindVar(Input(framework::GradVarName("Out"))); - auto *dx_var = scope.FindVar(Output(framework::GradVarName("X"))); - PADDLE_ENFORCE_NOT_NULL( - dx_var, - platform::errors::NotFound( - "Input(X@GRAD) of ShrinkRNNMemoryGradOp is not found.")); - auto *x_var = scope.FindVar(Input("X")); - PADDLE_ENFORCE_NOT_NULL( - x_var, - platform::errors::NotFound( - "Input(x) of ShrinkRNNMemoryGradOp is not found.")); - auto &x_tensor = x_var->Get(); - auto &dx_tensor = *dx_var->GetMutable(); - dx_tensor.Resize(x_tensor.dims()); - dx_tensor.mutable_data(x_tensor.place(), x_tensor.dtype()); - - // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - if (dout_var == nullptr) { // dx_tensor fill zero - phi::funcs::set_constant(dev_ctx, &dx_tensor, 0.0f); - } else { - auto &dout_tensor = dout_var->Get(); - auto height = dout_tensor.dims()[0]; - auto slice = dx_tensor.Slice(0, static_cast(height)); - framework::TensorCopy(dout_tensor, dout_tensor.place(), dev_ctx, &slice); - if (dx_tensor.dims()[0] > height) { - auto rest_tensor = dx_tensor.Slice( - static_cast(height), static_cast(dx_tensor.dims()[0])); - phi::funcs::set_constant(dev_ctx, &rest_tensor, 0.0f); - } - } - dx_tensor.set_lod(x_tensor.lod()); - } -}; - -class ShrinkRNNMemoryGradInferShape : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *context) const override { - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "ShrinkRNNMemoryGrad"); - OP_INOUT_CHECK(context->HasOutput(framework::GradVarName("X")), - "Output", - "X", - "ShrinkRNNMemoryGrad"); - - context->ShareDim("X", /*->*/ framework::GradVarName("X")); - context->ShareLoD("X", /*->*/ framework::GradVarName("X")); - } -}; - -template -class ShrinkRNNGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - 
op->SetType("shrink_rnn_memory_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(shrink_rnn_memory, - ops::ShrinkRNNMemoryOp, - ops::ShrinkRNNMemoryInferShape, - ops::ShrinkRNNMemoryOpProtoMaker, - ops::ShrinkRNNGradOpMaker, - ops::ShrinkRNNGradOpMaker); -REGISTER_OPERATOR(shrink_rnn_memory_grad, - ops::ShrinkRNNMemoryGradOp, - ops::ShrinkRNNMemoryGradInferShape); diff --git a/paddle/fluid/operators/shuffle_batch_op.cc b/paddle/fluid/operators/shuffle_batch_op.cc index 0b5a7bf5540ab..1f1415aa995fd 100644 --- a/paddle/fluid/operators/shuffle_batch_op.cc +++ b/paddle/fluid/operators/shuffle_batch_op.cc @@ -38,26 +38,23 @@ class ShuffleBatchOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), - true, - platform::errors::NotFound("Input(X) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Seed"), - true, - platform::errors::NotFound("Input(Seed) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), - true, - platform::errors::NotFound("Output(Out) should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), + true, + phi::errors::NotFound("Input(X) should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Seed"), + true, + phi::errors::NotFound("Input(Seed) should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), + true, + phi::errors::NotFound("Output(Out) should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasOutput("ShuffleIdx"), true, - platform::errors::NotFound("Output(ShuffleIdx) should not be null.")); + phi::errors::NotFound("Output(ShuffleIdx) should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasOutput("SeedOut"), true, - platform::errors::NotFound("Output(SeedOut) should not be null.")); + phi::errors::NotFound("Output(SeedOut) should not be null.")); ctx->ShareDim("X", "Out"); ctx->ShareLoD("X", "Out"); @@ -122,15 +119,15 @@ class ShuffleBatchOpGrad : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->HasInput("ShuffleIdx"), true, - platform::errors::NotFound("Input(ShuffleIdx) should not be null")); + phi::errors::NotFound("Input(ShuffleIdx) should not be null")); PADDLE_ENFORCE_EQ( ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::NotFound("Grad Input(Out) should not be null")); + phi::errors::NotFound("Grad Input(Out) should not be null")); PADDLE_ENFORCE_EQ( ctx->HasOutput(framework::GradVarName("X")), true, - platform::errors::NotFound("Grad Output(X) should not be null")); + phi::errors::NotFound("Grad Output(X) should not be null")); ctx->ShareDim(framework::GradVarName("Out"), framework::GradVarName("X")); ctx->ShareLoD(framework::GradVarName("Out"), framework::GradVarName("X")); diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc index c8f9d9469848e..f95bed3bed5ef 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cc +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -29,7 +29,7 @@ class ShuffleChannelOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( input_dims.size(), 4, - platform::errors::InvalidArgument("The layout of input is NCHW.")); + phi::errors::InvalidArgument("The layout of input is 
NCHW.")); ctx->SetOutputDim("Out", input_dims); } @@ -55,10 +55,10 @@ class ShuffleChannelOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("group", "the number of groups.") .SetDefault(1) .AddCustomChecker([](const int& group) { - PADDLE_ENFORCE_GE(group, - 1, - platform::errors::InvalidArgument( - "group should be larger than 0.")); + PADDLE_ENFORCE_GE( + group, + 1, + phi::errors::InvalidArgument("group should be larger than 0.")); }); AddComment(R"DOC( Shuffle Channel operator @@ -83,7 +83,7 @@ class ShuffleChannelGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( input_dims.size(), 4, - platform::errors::InvalidArgument("The layout of input is NCHW.")); + phi::errors::InvalidArgument("The layout of input is NCHW.")); ctx->SetOutputDim(framework::GradVarName("X"), input_dims); } diff --git a/paddle/fluid/operators/similarity_focus_op.cc b/paddle/fluid/operators/similarity_focus_op.cc deleted file mode 100644 index 4508459f25514..0000000000000 --- a/paddle/fluid/operators/similarity_focus_op.cc +++ /dev/null @@ -1,99 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/similarity_focus_op.h" - -namespace paddle { -namespace operators { -class SimilarityFocusOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(Tensor, default Tensor), a 4-D tensor with shape," - " [BatchSize, X, Y, Z]"); - AddOutput("Out", - "(Tensor, default Tensor), the similarity focus mask" - " with the same shape of input X."); - AddAttr("axis", - "(int32), indicating the dimension to be select. It can" - " only be 1, 2, or 3."); - AddAttr>("indexes", - "(std::vector), indicating the indexes" - " of the selected dimension."); - AddComment(R"DOC( -SimilarityFocus Operator. - -Generate a similarity focus mask with the same shape of input using the following method: -1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding - to the axis according to the indexes. For example, if axis=1 and indexes=[a], - it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X - is (BatchSize, A, B, C), the shape of tensor T is (BatchSize, B, C). -2. For each index, find the largest numbers in the tensor T, so that the same - row and same column has at most one number(what it means is that if the - largest number has been found in the i-th row and the j-th column, then - the numbers in the i-th row or j-th column will be skipped. And then the - next largest number will be selected from the remaining numbers. Obviously - there will be min(B, C) numbers), and mark the corresponding position of the - 3-D similarity focus mask as 1, otherwise as 0. Do elementwise-or for - each index. -3. Broadcast the 3-D similarity focus mask to the same shape of input X. 
- -Refer to `Similarity Focus Layer `_ -)DOC"); - } -}; - -class SimilarityFocusOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SimilarityFocus"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SimilarityFocus"); - - auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - x_dims.size(), - 4, - platform::errors::InvalidArgument( - "The dimension size of Input(X) be 4, but received %d.", - x_dims.size())); - ctx->SetOutputDim("Out", x_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - platform::CPUPlace()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - similarity_focus, - ops::SimilarityFocusOp, - ops::SimilarityFocusOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -PD_REGISTER_STRUCT_KERNEL(similarity_focus, - CPU, - ALL_LAYOUT, - ops::SimilarityFocusKernel, - float, - double) {} diff --git a/paddle/fluid/operators/similarity_focus_op.h b/paddle/fluid/operators/similarity_focus_op.h deleted file mode 100644 index 32349e9570369..0000000000000 --- a/paddle/fluid/operators/similarity_focus_op.h +++ /dev/null @@ -1,187 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
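The kernel in the header deleted below flattens 4-D coordinates by hand via a compute_index lambda; it is the usual row-major offset formula, factored with Horner's rule in this hypothetical sketch:

```cpp
#include <cassert>
#include <cstdint>

// Row-major flattening for a tensor of shape [D1, D2, D3, D4]:
// offset(d1, d2, d3, d4) = ((d1*D2 + d2)*D3 + d3)*D4 + d4.
int64_t Flatten4D(const int64_t dim[4], int d1, int d2, int d3, int d4) {
  return ((static_cast<int64_t>(d1) * dim[1] + d2) * dim[2] + d3) * dim[3] + d4;
}

int main() {
  const int64_t dim[4] = {2, 3, 4, 5};
  // Same expansion as d1*dim[1]*dim[2]*dim[3] + d2*dim[2]*dim[3] + d3*dim[3] + d4.
  assert(Flatten4D(dim, 1, 2, 3, 4) == 1 * 60 + 2 * 20 + 3 * 5 + 4);  // 119
  return 0;
}
```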
*/ - -#pragma once - -#include -#include -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class SimilarityFocusKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - phi::DenseTensor* out = context.Output("Out"); - const phi::DenseTensor* x = context.Input("X"); - T* out_data = out->mutable_data(context.GetPlace()); - const T* x_data = x->data(); - - int axis = context.Attr("axis"); - std::vector indexes = context.Attr>("indexes"); - - int64_t batch_size = x->dims()[0]; - int64_t dim[4]; - for (int i = 1; i <= 3; ++i) { - dim[i] = x->dims()[i]; - } - - PADDLE_ENFORCE_GT( - indexes.size(), - 0, - platform::errors::InvalidArgument("The size of Attr(indexes) must be " - "greater than 0, but received %d.", - indexes.size())); - - for (size_t i = 0; i < indexes.size(); i++) { - PADDLE_ENFORCE_GT( - dim[axis], - indexes[i], - platform::errors::InvalidArgument( - "Each value of Attr(indexes) must be less than X.dim[axis], " - "but indexes[%d] received %d.", - i, - indexes[i])); - } - - int64_t array_size = 1; - for (int i = 1; i <= 3; ++i) { - if (i != axis) { - array_size *= dim[i]; - } - } - - std::vector> array(array_size); - - bool (*cmp)(std::pair, std::pair) = - [](std::pair x, std::pair y) { - return x.first > y.first; - }; - - int64_t (*compute_index)(int64_t*, int, int, int, int) = - [](int64_t* dim, int d1, int d2, int d3, int d4) { - return d1 * dim[1] * dim[2] * dim[3] + d2 * dim[2] * dim[3] + - d3 * dim[3] + d4; - }; - - PADDLE_ENFORCE_GT( - axis, - 0, - platform::errors::InvalidArgument( - "The value of Attr(axis) must be 1 or 2 or 3, but received %d.", - axis)); - PADDLE_ENFORCE_LT( - axis, - 4, - platform::errors::InvalidArgument( - "The value of Attr(axis) must be 1 or 2 or 3, but received %d.", - axis)); - memset(out_data, 0, sizeof(T) * batch_size * dim[1] * dim[2] * dim[3]); - for (int i = 0; i < batch_size; ++i) { - for (auto index : indexes) { - if (axis == 1) { - for (int j = 0; j < dim[2]; ++j) { - for (int k = 0; k < dim[3]; ++k) { - array[j * dim[3] + k] = std::make_pair( - x_data[compute_index(dim, i, index, j, k)], j * dim[3] + k); - } - } - - std::sort(array.begin(), array.end(), cmp); - int tag_num = 0; - std::vector tag2(dim[2]), tag3(dim[3]); - for (auto x : array) { - int idx2 = x.second / dim[3]; - int idx3 = x.second % dim[3]; - if (tag2[idx2] || tag3[idx3]) { - continue; - } - tag_num++; - tag2[idx2] = true; - tag3[idx3] = true; - for (int j = 0; j < dim[1]; ++j) { - out_data[compute_index(dim, i, j, idx2, idx3)] = 1; - } - if (tag_num == std::min(dim[2], dim[3])) { - break; - } - } - } else if (axis == 2) { - for (int j = 0; j < dim[1]; ++j) { - for (int k = 0; k < dim[3]; ++k) { - array[j * dim[3] + k] = std::make_pair( - x_data[compute_index(dim, i, j, index, k)], j * dim[3] + k); - } - } - - std::sort(array.begin(), array.end(), cmp); - int tag_num = 0; - std::vector tag1(dim[1]), tag3(dim[3]); - for (auto x : array) { - int idx1 = x.second / dim[3]; - int idx3 = x.second % dim[3]; - if (tag1[idx1] || tag3[idx3]) { - continue; - } - tag_num++; - tag1[idx1] = true; - tag3[idx3] = true; - for (int j = 0; j < dim[2]; ++j) { - out_data[compute_index(dim, i, idx1, j, idx3)] = 1; - } - if (tag_num == std::min(dim[1], dim[3])) { - break; - } - } - } else if (axis == 3) { - for (int j = 0; j < dim[1]; ++j) { - for (int k = 0; k < dim[2]; ++k) { - array[j * dim[2] + k] = 
std::make_pair( - x_data[compute_index(dim, i, j, k, index)], j * dim[2] + k); - } - } - - std::sort(array.begin(), array.end(), cmp); - int tag_num = 0; - std::vector tag1(dim[1]), tag2(dim[2]); - for (auto x : array) { - int idx1 = x.second / dim[2]; - int idx2 = x.second % dim[2]; - if (tag1[idx1] || tag2[idx2]) { - continue; - } - tag_num++; - tag1[idx1] = true; - tag2[idx2] = true; - for (int j = 0; j < dim[3]; ++j) { - out_data[compute_index(dim, i, idx1, idx2, j)] = 1; - } - if (tag_num == std::min(dim[1], dim[2])) { - break; - } - } - } - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index 16b895ce557a7..881e3b59f0db7 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -40,7 +40,7 @@ class SliceOp : public framework::OperatorWithKernel { if (x_var_type == framework::proto::VarType::LOD_TENSOR_ARRAY) { PADDLE_ENFORCE_EQ(axes.size(), 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of axes must be 1 when the Input of " "SliceOp is LoDTensorArray, " "but received %d.", @@ -63,7 +63,7 @@ class SliceOp : public framework::OperatorWithKernel { auto in_dims = ctx->GetInputDim("Input"); PADDLE_ENFORCE_LT(in_dims.size(), 7, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rank of input should be less than 7.")); framework::DDim out_dims(in_dims); @@ -83,31 +83,31 @@ class SliceOp : public framework::OperatorWithKernel { if (ctx->HasInputs("StartsTensorList")) { starts_size = ctx->Inputs("StartsTensorList").size(); - PADDLE_ENFORCE_GT(starts_size, - 0, - platform::errors::InvalidArgument( - "StartsTensorList size can't be zero")); + PADDLE_ENFORCE_GT( + starts_size, + 0, + phi::errors::InvalidArgument("StartsTensorList size can't be zero")); } if (ctx->HasInputs("EndsTensorList")) { ends_size = ctx->Inputs("EndsTensorList").size(); - PADDLE_ENFORCE_GT(ends_size, - 0, - platform::errors::InvalidArgument( - "EndsTensorList size can't be zero")); + PADDLE_ENFORCE_GT( + ends_size, + 0, + phi::errors::InvalidArgument("EndsTensorList size can't be zero")); } if (!ctx->HasInput("StartsTensor")) { PADDLE_ENFORCE_EQ( starts_size, axes.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of starts must be equal to the size of axes.")); } if (!ctx->HasInput("EndsTensor")) { PADDLE_ENFORCE_EQ( ends_size, axes.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of ends must be equal to the size of axes.")); } for (auto &axis : axes) { @@ -143,7 +143,7 @@ class SliceOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( in_tensor.IsInitialized(), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The tensor Input (Input) of Slice op is not initialized.")); // NOTE: cuda pinned tensor need to copy its data to target place if (platform::is_cuda_pinned_place(in_tensor.place())) { @@ -304,14 +304,13 @@ class SliceOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), + true, + phi::errors::InvalidArgument("Input should not be null")); PADDLE_ENFORCE_EQ( - ctx->HasInput("Input"), + ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::InvalidArgument("Input should not be null")); - 
PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), - true, - platform::errors::InvalidArgument( - "Input(Out@GRAD) should not be null")); + phi::errors::InvalidArgument("Input(Out@GRAD) should not be null")); auto x_var_type = ctx->GetInputsVarType("Input")[0]; if (x_var_type == framework::proto::VarType::LOD_TENSOR_ARRAY) { // If the var type of input is LOD_TENSOR_ARRAY, diff --git a/paddle/fluid/operators/soft_relu_op.cu b/paddle/fluid/operators/soft_relu_op.cu index 3963b372c9c8e..e4273c73530f6 100644 --- a/paddle/fluid/operators/soft_relu_op.cu +++ b/paddle/fluid/operators/soft_relu_op.cu @@ -39,7 +39,7 @@ PD_REGISTER_STRUCT_KERNEL(soft_relu, ops::SoftReluCudaKernel, float, double, - plat::float16, + phi::dtype::float16, plat::bfloat16) {} PD_REGISTER_STRUCT_KERNEL(soft_relu_grad, GPU, @@ -47,5 +47,5 @@ PD_REGISTER_STRUCT_KERNEL(soft_relu_grad, ops::SoftReluGradCudaKernel, float, double, - plat::float16, + phi::dtype::float16, plat::bfloat16) {} diff --git a/paddle/fluid/operators/sparse_attention_op.cc b/paddle/fluid/operators/sparse_attention_op.cc index 26dfc0fbbc64d..6d6a567ab1b61 100644 --- a/paddle/fluid/operators/sparse_attention_op.cc +++ b/paddle/fluid/operators/sparse_attention_op.cc @@ -99,15 +99,15 @@ class SparseAttentionOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(dims_q.size(), static_cast(4), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Dimension in query' shapes should be 4.")); - PADDLE_ENFORCE_EQ(dims_k.size(), - static_cast(4), - platform::errors::InvalidArgument( - "Dimension in key' shapes should be 4.")); + PADDLE_ENFORCE_EQ( + dims_k.size(), + static_cast(4), + phi::errors::InvalidArgument("Dimension in key' shapes should be 4.")); PADDLE_ENFORCE_EQ(dims_v.size(), static_cast(4), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Dimension in value' shapes should be 4.")); auto batch_size = dims_q[0]; diff --git a/paddle/fluid/operators/sparse_attention_op.cu b/paddle/fluid/operators/sparse_attention_op.cu index 117de1c1a55df..ec41a829e7f72 100644 --- a/paddle/fluid/operators/sparse_attention_op.cu +++ b/paddle/fluid/operators/sparse_attention_op.cu @@ -314,7 +314,7 @@ void SparseSoftmaxForward(const phi::GPUContext& ctx, columns_data, num_rows); } else { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "The head_dim of query in sparse_attention op should less or equal " "512")); } @@ -412,7 +412,7 @@ void SparseSoftmaxBackward(const phi::GPUContext& ctx, columns_data, num_rows); } else { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "The head_dim of query in sparse_attention op should less or equal " "512")); } @@ -425,7 +425,7 @@ inline cudaDataType_t GetGpuType(const VarType::Type data_type) { } else if (data_type == VarType::FP64) { return CUDA_R_64F; } else { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Not support tensor type in sparse_attention OP: %s", framework::DataTypeToString(data_type))); } diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc deleted file mode 100644 index 6b79d5c35b783..0000000000000 --- a/paddle/fluid/operators/split_lod_tensor_op.cc +++ /dev/null @@ -1,233 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/phi/core/lod_utils.h" - -namespace phi { -class DenseTensor; -} // namespace phi - -namespace paddle { -namespace framework { -class InferShapeContext; -class OpDesc; -class Scope; -} // namespace framework -namespace imperative { -class OpBase; -} // namespace imperative -} // namespace paddle - -namespace paddle { -namespace operators { - -struct CopyRange { - size_t begin; - size_t end; -}; - -using LoD = framework::LoD; - -class SplitLoDTensorOp : public framework::OperatorBase { - public: - SplitLoDTensorOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - auto &x = scope.FindVar(Input("X"))->Get(); - auto &mask = scope.FindVar(Input("Mask"))->Get(); - auto *out_true = - scope.FindVar(Output("OutTrue"))->GetMutable(); - auto *out_false = - scope.FindVar(Output("OutFalse"))->GetMutable(); - auto level = static_cast(Attr("level")); - auto &x_lod = x.lod(); - auto &mask_dim = mask.dims(); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(dev_place); - - std::unique_ptr cpu_mask{new phi::DenseTensor()}; - if (platform::is_cpu_place(mask.place())) { - cpu_mask->ShareDataWith(mask); - } else if (platform::is_gpu_place(mask.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - framework::TensorCopy( - mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); -#else - PADDLE_THROW(paddle::platform::errors::Fatal( - "Not support GPU, Please compile WITH_GPU option")); -#endif - } - auto *mask_data = cpu_mask->data(); - - std::vector> copy_ranges(2); - - // set out_true/out_false lod - for (size_t t = 0; t < 2; t++) { - LoD *lod = nullptr; - if (t == 0) { - lod = out_false->mutable_lod(); - } else { - lod = out_true->mutable_lod(); - } - lod->clear(); - for (size_t i = 0; i < static_cast(mask_dim[0]); i++) { - if (static_cast(mask_data[i]) == t) { - size_t start_idx = i; - auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset( - x_lod, start_idx, start_idx + 1, level); - - auto &lod_length = lod_and_offset.first; - phi::AppendLoD(lod, lod_length); - - size_t start_offset = lod_and_offset.second.first; - size_t end_offset = lod_and_offset.second.second; - copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset}); - } - } - } - - for (size_t t = 0; t < 2; ++t) { - phi::DenseTensor *out = nullptr; - if (t == 0) { - out = out_false; - } else { - out = out_true; - } - auto &ranges = copy_ranges[t]; - size_t height = std::accumulate( - ranges.begin(), ranges.end(), 0UL, [](size_t a, const CopyRange &b) { - return a + b.end - b.begin; - }); - auto x_dim = x.dims(); - x_dim[0] = static_cast(height); - out->Resize(x_dim); - out->mutable_data(x.place(), x.type()); - size_t offset = 0; - for (auto &each_range : ranges) { - size_t len = 
each_range.end - each_range.begin; - if (len == 0) { - continue; - } - // out[offset: offset+len] = x[each_range.begin: each_range.end] - auto slice = out->Slice(static_cast(offset), - static_cast(offset + len)); - framework::TensorCopy(x.Slice(static_cast(each_range.begin), - static_cast(each_range.end)), - x.place(), - dev_ctx, - &slice); - offset += len; - } - } - } -}; - -class SplitLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "The input phi::DenseTensor"); - AddInput("Mask", "A bool column vector which mask the input"); - AddOutput("OutTrue", "True branch of input phi::DenseTensor"); - AddOutput("OutFalse", "False branch of input phi::DenseTensor"); - AddAttr("level", "(int) the specific lod level to split.") - .SetDefault(0) - .EqualGreaterThan(0); - AddComment( - R"DOC( - Split a phi::DenseTensor with a Mask at certain level. The input phi::DenseTensor - has 3 sequence at certain lod level. The Mask is a bool column vector, - such as [0, 1, 0] at the same level. The first and third sequence will - be send to False Output phi::DenseTensor; whereas the second sequence will - be send to True Output phi::DenseTensor. Please refer to MergeLoDTensorOp.)DOC"); - } -}; - -class SplitLoDTensorInferShape : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *context) const override { - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "SplitLoDTensor"); - OP_INOUT_CHECK( - context->HasInput("Mask"), "Input", "Mask", "SplitLoDTensor"); - OP_INOUT_CHECK( - context->HasOutput("OutTrue"), "Output", "OutTrue", "SplitLoDTensor"); - OP_INOUT_CHECK( - context->HasOutput("OutFalse"), "Output", "OutFalse", "SplitLoDTensor"); - - auto mask_dim = context->GetInputDim("Mask"); - PADDLE_ENFORCE_EQ( - mask_dim.size(), - 2, - platform::errors::InvalidArgument( - "If you are using IfElse OP:" - "\n\nie = fluid.layers.IfElse(cond=cond)\nwith " - "ie.true_block():\n out_1 = ie.input(x)\n\n" - "Please ensure that the cond should be a 2-D tensor and " - "the second dim size of cond should be 1. " - "But now the cond's shape is [", - *mask_dim.Get(), - "].\n")); - PADDLE_ENFORCE_EQ(mask_dim[1], - 1, - platform::errors::InvalidArgument( - "If you are using IfElse OP:" - "\n\nie = fluid.layers.IfElse(cond=cond)\nwith " - "ie.true_block():\n out_1 = ie.input(x)\n\n" - "Please ensure that the cond should be a 2-D tensor " - "and the second dim size of cond should be 1. 
" - "But now the cond's shape is [", - *mask_dim.Get(), - "].\n")); - - context->SetOutputDim("OutTrue", context->GetInputDim("X")); - context->SetOutputDim("OutFalse", context->GetInputDim("X")); - } -}; - -template -class SplitLoDTensorArrayGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("merge_lod_tensor"); - grad_op->SetInput("InTrue", this->OutputGrad("OutTrue")); - grad_op->SetInput("InFalse", this->OutputGrad("OutFalse")); - grad_op->SetInput("Mask", this->Input("Mask")); - grad_op->SetInput("X", this->Input("X")); - grad_op->SetOutput("Out", this->InputGrad("X")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - split_lod_tensor, - ops::SplitLoDTensorOp, - ops::SplitLoDTensorOpProtoMaker, - ops::SplitLoDTensorInferShape, - ops::SplitLoDTensorArrayGradMaker, - ops::SplitLoDTensorArrayGradMaker); diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc index ddda1131f5cc7..1a4eace1f3398 100644 --- a/paddle/fluid/operators/split_op.cc +++ b/paddle/fluid/operators/split_op.cc @@ -35,11 +35,11 @@ class SplitOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(X) of SplitOp should not be null.")); PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Outputs(Out) of SplitOp should not be empty.")); int axis = static_cast(ctx->Attrs().Get("axis")); int num = static_cast(ctx->Attrs().Get("num")); @@ -218,7 +218,7 @@ class SplitCompositeGradOpMaker : public prim::CompositeGradOpMakerBase { std::vector out_grad = this->GetMultiOutputGrad("Out"); if (tensor_axis.is_initialized() || tensor_sections.is_initialized()) { - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(phi::errors::Unimplemented( "We don't support dynamic index or sections from tensor for split " "composite grad for now. ")); } else { diff --git a/paddle/fluid/operators/split_op.h b/paddle/fluid/operators/split_op.h index aaee366a4636a..e7ba7d0706fd2 100644 --- a/paddle/fluid/operators/split_op.h +++ b/paddle/fluid/operators/split_op.h @@ -20,7 +20,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" +#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/split_kernel.h" namespace paddle { namespace operators { @@ -39,7 +39,7 @@ static inline std::vector UpdateOutsDims( PADDLE_ENFORCE_EQ( input_axis_dim % num, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input's size along the split dimension " "must be evenly divisible by Attr(num_or_sections). " "But received Attr(num_or_sections) " @@ -75,7 +75,7 @@ static inline std::vector UpdateOutsDims( PADDLE_ENFORCE_LE( num_of_unk, 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only one dimension value of Attr(num_or_sections) " "in SplitOp can be -1. 
" "But received Attr(num_or_sections) = [%s].", @@ -89,7 +89,7 @@ static inline std::vector UpdateOutsDims( PADDLE_ENFORCE_LT( sum_of_section, input_axis_dim, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Sum of Attr(num_or_sections) other than unknown section " "must be less than the input's " "size " @@ -105,7 +105,7 @@ static inline std::vector UpdateOutsDims( PADDLE_ENFORCE_EQ( sum_of_section, input_axis_dim, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Sum of Attr(num_or_sections) must be equal to the input's " "size " "along the split dimension. But received Attr(num_or_sections)" diff --git a/paddle/fluid/operators/spp_op.cc b/paddle/fluid/operators/spp_op.cc deleted file mode 100644 index 98072746e8eee..0000000000000 --- a/paddle/fluid/operators/spp_op.cc +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -Indicesou may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/spp_op.h" - -#include -#include -namespace paddle { -namespace operators { - -class SppOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput( - "X", - "(Tensor) The input tensor of spp operator. " - "The format of input tensor is NCHW. Where N is batch size, C is the " - "number of channels, H and W is the height and width of feature."); - AddOutput("Out", - "(Tensor) The output tensor of spp operator." - "N * M." - "M = C * H * W"); - AddAttr("pyramid_height", "(int), multi level pooling"); - AddAttr( - "pooling_type", - "(string), pooling type, can be \"max\" for max-pooling " - "and \"avg\" for average-pooling.") - .InEnum({"max", "avg"}); - AddComment(R"DOC( - "With spatial pyramid pooling, the input image can - be of any sizes. This not only allows arbitrary aspect - ratios, but also allows arbitrary scales. We can resize - the input image to any scale (e.g., min(w, h)=180, 224, - ...) and apply the same deep network. When the - input image is at different scales, the network (with - the same filter sizes) will extract features at different - scales. The scales play important roles in traditional - methods. 
- Input shape: $(N, C_{in}, H_{in}, W_{in})$ - Output shape: $(H_{out}, W_{out})$ - Where - $$ - H_{out} = N \\ - W_{out} = (((4^pyramid_height) - 1) / (4 - 1))$ * C_{in} - $$ - paper https://arxiv.org/pdf/1406.4729v4.pdf - )DOC"); - } -}; - -class SppOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), - true, - platform::errors::InvalidArgument( - "Input(X) of SppOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), - true, - platform::errors::InvalidArgument( - "Output(Out) of SppOp should not be null.")); - auto in_x_dims = ctx->GetInputDim("X"); - int pyramid_height = ctx->Attrs().Get("pyramid_height"); - PADDLE_ENFORCE_EQ(in_x_dims.size(), - 4, - platform::errors::InvalidArgument( - "Spping intput must be of 4-dimensional.")); - int outlen = - ((std::pow(4, pyramid_height) - 1) / (4 - 1)) * in_x_dims[1]; // NOLINT - std::vector output_shape({in_x_dims[0], outlen}); - ctx->SetOutputDim("Out", common::make_ddim(output_shape)); - } -}; - -class SppOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), - true, - platform::errors::InvalidArgument("Input(X) must not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput(framework::GradVarName("X")), - true, - platform::errors::InvalidArgument("Input(X@GRAD) should not be null.")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - spp, - ops::SppOp, - ops::SppOpMaker, - paddle::framework::DefaultGradOpMaker, - paddle::framework::DefaultGradOpMaker); -REGISTER_OPERATOR(spp_grad, ops::SppOpGrad); - -PD_REGISTER_STRUCT_KERNEL(spp, CPU, ALL_LAYOUT, ops::SppKernel, float, double) { -} -PD_REGISTER_STRUCT_KERNEL( - spp_grad, CPU, ALL_LAYOUT, ops::SppGradKernel, float, double) {} diff --git a/paddle/fluid/operators/spp_op.cu.cc b/paddle/fluid/operators/spp_op.cu.cc deleted file mode 100644 index b41fa8ae5fcf7..0000000000000 --- a/paddle/fluid/operators/spp_op.cu.cc +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -Indicesou may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/spp_op.h" - -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL(spp, GPU, ALL_LAYOUT, ops::SppKernel, float, double) { -} -PD_REGISTER_STRUCT_KERNEL( - spp_grad, GPU, ALL_LAYOUT, ops::SppGradKernel, float, double) {} diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h deleted file mode 100644 index 5d3f4a78020a0..0000000000000 --- a/paddle/fluid/operators/spp_op.h +++ /dev/null @@ -1,220 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
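The outlen expression in SppOp::InferShape above is the geometric series 1 + 4 + ... + 4^(H-1) = (4^H - 1)/3 cells per channel, since pyramid level p pools to a 2^p x 2^p grid. A quick standalone check, using hypothetical pyramid_height = 3 and channels = 8:

```cpp
#include <cassert>
#include <cmath>

// Levels p = 0..H-1 each produce a 2^p x 2^p grid, i.e. 4^p cells, so the
// per-channel total is (4^H - 1) / 3 -- the closed form used in InferShape.
int main() {
  const int pyramid_height = 3, channels = 8;
  int cells = 0;
  for (int p = 0; p < pyramid_height; ++p) cells += (1 << p) * (1 << p);
  const int outlen =
      ((static_cast<int>(std::pow(4, pyramid_height)) - 1) / 3) * channels;
  assert(cells * channels == outlen);  // 21 cells * 8 channels == 168
  return 0;
}
```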
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -Indicesou may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/phi_utils.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/pooling.h" -#include "paddle/phi/kernels/funcs/strided_memcpy.h" - -namespace paddle { -namespace operators { -template -class SppKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const phi::DenseTensor* in_x = context.Input("X"); - auto* out = context.Output("Out"); - int pyramid_height = context.template Attr("pyramid_height"); - std::string pooling_type = - context.template Attr("pooling_type"); - out->mutable_data(context.GetPlace()); - auto out_stride = common::stride(out->dims()); - int input_h = in_x->dims()[2]; - int input_w = in_x->dims()[3]; - size_t output_offset = 0; - for (int p = 0; p < pyramid_height; ++p) { - int bins = std::pow(2, p); - int kernel_size_h = std::ceil(input_h / static_cast(bins)); - int kernel_size_w = std::ceil(input_w / static_cast(bins)); - int padding_h = (kernel_size_h * bins - input_h + 1) / 2; - int padding_w = (kernel_size_w * bins - input_w + 1) / 2; - std::vector kernel_size({kernel_size_h, kernel_size_w}); - std::vector strides({kernel_size_h, kernel_size_w}); - std::vector paddings({padding_h, padding_w}); - // pooling output shape - phi::DenseTensor out_level; - std::vector output_shape_vec( - {in_x->dims()[0], in_x->dims()[1], bins, bins}); - framework::DDim output_shape(common::make_ddim(output_shape_vec)); - out_level.mutable_data(output_shape, context.GetPlace()); - // pooling - if (pooling_type == "max") { - phi::funcs::Pool2dFunctor< - typename framework::ConvertToPhiContext::TYPE, - phi::funcs::MaxPool, - T> - pool_forward; - phi::funcs::MaxPool max_process; - pool_forward(context.template device_context(), - *in_x, - kernel_size, - strides, - paddings, - true, - false, - &out_level, - max_process); - } else if (pooling_type == "avg") { - phi::funcs::Pool2dFunctor< - typename framework::ConvertToPhiContext::TYPE, - phi::funcs::AvgPool, - T> - pool_forward; - phi::funcs::AvgPool avg_process; - pool_forward(context.template device_context(), - *in_x, - kernel_size, - strides, - paddings, - true, - false, - &out_level, - avg_process); - } - // flatten pooling output shape - int output_flatten_w = in_x->dims()[1] * bins * bins; - std::vector output_flatten_shape_vec( - {in_x->dims()[0], output_flatten_w}); - framework::DDim output_flatten_shape( - common::make_ddim(output_flatten_shape_vec)); - out_level.Resize(output_flatten_shape); - // concat - auto out_level_stride = common::stride(out_level.dims()); - phi::funcs::StridedMemcpy( - context.template device_context(), - out_level.data(), - out_level_stride, - out_level.dims(), - out_stride, - out->data() + output_offset); - output_offset += out_level.dims()[1] * out_level_stride[1]; - } - } -}; -template -class SppGradKernel : public framework::OpKernel { - public: 
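Each pyramid level in SppKernel::Compute below derives its pooling window, stride, and padding so that exactly `bins` outputs fit along every spatial axis. A standalone check of that geometry (a sketch assuming input >= bins, as spatial pyramid pooling expects):

```cpp
#include <cassert>
#include <cmath>

// For one pyramid level: window k = ceil(input / bins), stride = k, and
// padding p = (k * bins - input + 1) / 2, so the standard pooling output
// size (input - k + 2p) / k + 1 comes out to exactly `bins`.
int PooledDim(int input, int bins) {
  const int k = static_cast<int>(std::ceil(input / static_cast<double>(bins)));
  const int p = (k * bins - input + 1) / 2;
  return (input - k + 2 * p) / k + 1;
}

int main() {
  for (int input : {4, 6, 7, 9, 16}) {
    for (int bins : {1, 2, 4}) {
      if (input >= bins) assert(PooledDim(input, bins) == bins);
    }
  }
  return 0;
}
```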
- void Compute(const framework::ExecutionContext& context) const override { - const phi::DenseTensor* in_x = context.Input("X"); - const phi::DenseTensor* out = context.Input("Out"); - const phi::DenseTensor* out_grad = - context.Input(framework::GradVarName("Out")); - phi::DenseTensor* in_x_grad = - context.Output(framework::GradVarName("X")); - int pyramid_height = context.template Attr("pyramid_height"); - std::string pooling_type = - context.template Attr("pooling_type"); - auto& device_ctx = context.template device_context(); - phi::funcs::SetConstant< - typename framework::ConvertToPhiContext::TYPE, - T> - zero; - in_x_grad->mutable_data(context.GetPlace()); - zero(device_ctx, in_x_grad, static_cast(0)); - auto out_stride = common::stride(out->dims()); - int input_h = in_x->dims()[2]; - int input_w = in_x->dims()[3]; - size_t out_offset = 0; - for (int p = 0; p < pyramid_height; ++p) { - int bins = std::pow(2, p); - int kernel_size_h = std::ceil(input_h / static_cast(bins)); - int kernel_size_w = std::ceil(input_w / static_cast(bins)); - int padding_h = (kernel_size_h * bins - input_h + 1) / 2; - int padding_w = (kernel_size_w * bins - input_w + 1) / 2; - std::vector kernel_size({kernel_size_h, kernel_size_w}); - std::vector strides({kernel_size_h, kernel_size_w}); - std::vector paddings({padding_h, padding_w}); - // split out and outgrad ... to flatten - phi::DenseTensor out_level; - phi::DenseTensor outgrad_level; - int out_flatten_w = in_x->dims()[1] * bins * bins; - std::vector out_flatten_shape_vec( - {in_x->dims()[0], out_flatten_w}); - framework::DDim out_flatten_shape( - common::make_ddim(out_flatten_shape_vec)); - out_level.mutable_data(out_flatten_shape, context.GetPlace()); - outgrad_level.mutable_data(out_flatten_shape, context.GetPlace()); - auto flatten_stride = common::stride(out_level.dims()); - // memcpy - phi::funcs::StridedMemcpy( - context.template device_context(), - out->data() + out_offset, - out_stride, - out_level.dims(), - flatten_stride, - out_level.data()); - - phi::funcs::StridedMemcpy( - context.template device_context(), - out_grad->data() + out_offset, - out_stride, - outgrad_level.dims(), - flatten_stride, - outgrad_level.data()); - out_offset += out_level.dims()[1] * out_stride[1]; - // flatten backward to nchw - - std::vector out_shape_vec({in_x->dims()[0], in_x->dims()[1]}); - out_shape_vec.push_back( - (input_h - kernel_size_h + 2 * padding_h) / kernel_size_h + 1); - out_shape_vec.push_back( - (input_w - kernel_size_w + 2 * padding_w) / kernel_size_w + 1); - framework::DDim out_shape(common::make_ddim(out_shape_vec)); - out_level.ShareDataWith(out_level); - out_level.Resize(out_shape); - outgrad_level.ShareDataWith(outgrad_level); - outgrad_level.Resize(out_shape); - // pooling backward - if (pooling_type == "max") { - phi::funcs::MaxPool2dGradFunctor< - typename framework::ConvertToPhiContext::TYPE, - T> - pool2d_backward; - pool2d_backward(context.template device_context(), - *in_x, - *&out_level, - *&outgrad_level, - kernel_size, - strides, - paddings, - in_x_grad); - } else if (pooling_type == "avg") { - phi::funcs::Pool2dGradFunctor< - typename framework::ConvertToPhiContext::TYPE, - phi::funcs::AvgPoolGrad, - T> - pool_backward; - phi::funcs::AvgPoolGrad avg_process; - pool_backward(context.template device_context(), - *in_x, - *&out_level, - *&outgrad_level, - kernel_size, - strides, - paddings, - true, - false, - in_x_grad, - avg_process); - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git 
diff --git a/paddle/fluid/operators/squeeze_op.h b/paddle/fluid/operators/squeeze_op.h index 10ff809d60888..21aed7b00882c 100644 --- a/paddle/fluid/operators/squeeze_op.h +++ b/paddle/fluid/operators/squeeze_op.h @@ -45,7 +45,7 @@ framework::DDim GetOutputShape(const std::vector<int> squeeze_dims, PADDLE_ENFORCE_GE( current, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Each axis in Attr(axes) should be in the range of [%d, %d]" "But current axis is:%d, input tensor's shape = [%s].", -in_dims.size(), @@ -55,7 +55,7 @@ framework::DDim GetOutputShape(const std::vector<int> squeeze_dims, PADDLE_ENFORCE_LT( current, in_dims.size(), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Each axis in Attr(axes) should be in the range of [%d, %d]" "But current axis is:%d, input tensor's shape = [%s].", -in_dims.size(), diff --git a/paddle/fluid/operators/stft_op.cc b/paddle/fluid/operators/stft_op.cc index 34f6ee854dd7b..b1165bf2bf295 100644 --- a/paddle/fluid/operators/stft_op.cc +++ b/paddle/fluid/operators/stft_op.cc @@ -36,20 +36,20 @@ class StftOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( x_rank, 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(X) of StftOp should be a tensor with shape [N, T], " "but got rank %s.", x_rank)); PADDLE_ENFORCE_GT( hop_length, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Attribute(hop_length) should be greater than 0, but got %s.", hop_length)); PADDLE_ENFORCE_EQ( window_size, n_fft, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(Window) of StftOp should be equal with n_fft %s, " "but got %s.", n_fft, @@ -60,7 +60,7 @@ class StftOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_LE(n_fft, seq_length, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Attribute(frame_length) should be less equal than " "sequence length, but got (%s) > (%s).", n_fft, diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index d8b7e35d6d3a1..6cbb99ff2032f 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -38,20 +38,20 @@ class SumOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT( x_vars.size(), 0, - platform::errors::InvalidArgument("Input[X] should not be empty")); + phi::errors::InvalidArgument("Input[X] should not be empty")); PADDLE_ENFORCE_NOT_NULL( x_vars[0], - platform::errors::NotFound("Input var[%s] should not be nullptr", - x_vars_name[0])); + phi::errors::NotFound("Input var[%s] should not be nullptr", + x_vars_name[0])); if (x_vars[0]->IsType<phi::DenseTensor>()) { int dtype = -1; for (size_t idx = 0; idx < x_vars.size(); ++idx) { PADDLE_ENFORCE_NOT_NULL( x_vars[idx], - platform::errors::NotFound("Input var[%s] should not be nullptr", - x_vars_name[idx])); + phi::errors::NotFound("Input var[%s] should not be nullptr", + x_vars_name[idx])); auto tensor = framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_vars[idx]); if (!tensor->IsInitialized()) { @@ -62,13 +62,13 @@ class SumOp : public framework::OperatorWithKernel { } else { PADDLE_ENFORCE_EQ(dtype, framework::TransToProtoVarType(tensor->dtype()), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The inputs type of sum op must be same")); } } PADDLE_ENFORCE_NE(dtype, -1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Sum operator should have at least one tensor")); auto data_type = static_cast<framework::proto::VarType::Type>(dtype); @@ -108,13 +108,13 @@ class SumOp : public framework::OperatorWithKernel { } } } - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Expected each tensor in Input(x) in sum op has be initialized, but " "some tensor in Input(x) is not be initialized, please check your " "code.", framework::ToTypeName(x_vars[0]->Type()))); } - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Expected type of Input(X) must be Tensor, SelectedRows or " "LodTensorArray. But got " "unsupport type: %s.", @@ -164,7 +164,7 @@ class SumOpVarTypeInference : public framework::VarTypeInference { << " type is " << ctx->GetInputType("X", static_cast<int>(ind)) << "\n"; } - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Not all inputs are tensor array:\n%s", os.str())); } var_type = framework::proto::VarType::LOD_TENSOR_ARRAY;
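Aside from the file deletions, the bulk of this patch is the mechanical rename visible in the squeeze/stft/sum hunks above: the error factories move from the fluid platform namespace to phi. A minimal sketch of the pattern, using one of the checks above (same macro, same message; only the namespace changes):

// before
PADDLE_ENFORCE_GT(
    x_vars.size(),
    0,
    platform::errors::InvalidArgument("Input[X] should not be empty"));
// after
PADDLE_ENFORCE_GT(
    x_vars.size(),
    0,
    phi::errors::InvalidArgument("Input[X] should not be empty"));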
diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index 273e2c7b65100..c480bb9bb12e9 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -24,12 +24,12 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/diag_op.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -67,9 +67,9 @@ struct RealMulComplexFunctor { PADDLE_ENFORCE_LT( y.imag, 1e-6, - platform::errors::InvalidArgument("The image part of y must to be 0" - "but got [%d]", - y.imag)); + phi::errors::InvalidArgument("The image part of y must to be 0" + "but got [%d]", + y.imag)); return platform::complex<T>(x.real * y.real, x.imag * y.real); } @@ -79,9 +79,9 @@ static std::vector<int64_t> GetBroadcastShape(InTensors ins) { PADDLE_ENFORCE_EQ( ins.size(), 2, - platform::errors::InvalidArgument("GetBroadcastShape Receive 2 tensors" - "but got [%d]", - ins.size())); + phi::errors::InvalidArgument("GetBroadcastShape Receive 2 tensors" + "but got [%d]", + ins.size())); auto x_dim = ins[0]->dims(); auto y_dim = ins[1]->dims(); std::vector<int64_t> broadcast_shape = @@ -104,7 +104,7 @@ static std::vector<int64_t> GetBroadcastShape(InTensors ins) { broadcast_shape[final_rank - i] = x_dim[rank_x - i]; continue; } - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Wrong Input Shape in broadcast operator: " "Input(X)'s shape must follow the broadcast rule with Input(Y)'s " "shape, but received [%s] (X) vs [%s] (Y).", @@ -125,14 +125,14 @@ static inline framework::DDim ComputeAndCheckShapeForConcatOp( PADDLE_ENFORCE_EQ( inputs_dims[i].size(), out_dims.size(), - platform::errors::InvalidArgument("The shape of input[0] and input[%d] " - "is expected to be equal." - "But received input[0]'s shape = " - "[%s], input[%d]'s shape = [%s].", - i, - inputs_dims[0], - i, - inputs_dims[i])); + phi::errors::InvalidArgument("The shape of input[0] and input[%d] " + "is expected to be equal."
+ "But received input[0]'s shape = " + "[%s], input[%d]'s shape = [%s].", + i, + inputs_dims[0], + i, + inputs_dims[i])); for (size_t j = 0; j < in_zero_dims_size; j++) { if (j == axis) { if (is_runtime) { @@ -151,7 +151,7 @@ static inline framework::DDim ComputeAndCheckShapeForConcatOp( // check all shape in run time PADDLE_ENFORCE_EQ(inputs_dims[0][j], inputs_dims[i][j], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The %d-th dimension of input[0] and input[%d] " "is expected to be equal." "But received input[0]'s shape = " @@ -175,7 +175,7 @@ static inline int64_t ComputeAxisForConcatOp(int64_t axis, int64_t rank) { PADDLE_ENFORCE_EQ( axis >= -rank && axis < rank, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The axis is expected to be in range of [%d, %d), but got %d", -rank, rank, @@ -205,7 +205,7 @@ static std::vector get_broadcast_batch_portion( PADDLE_ENFORCE_EQ( (x_size == y_size || x_size == 1 || y_size == 1), true, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The size of tensor x (%d) must match the size of tensor y " "(%d) at non-singleton dimension %d.", x_size, @@ -337,7 +337,7 @@ struct DeviceIndependenceTensorOperations { DITO_TRANSPOSE_RANK_CASE(5); DITO_TRANSPOSE_RANK_CASE(6); default: { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Invalid Rank number, " "currently only support rank between 2~6")); } @@ -350,11 +350,11 @@ struct DeviceIndependenceTensorOperations { int padding_value = 0) { PADDLE_ENFORCE_EQ(padding_value, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Current diag only support padding_value = 0")); PADDLE_ENFORCE_EQ(offset, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Current diag only support offset = 0," "you can use DiagOp instead(not recommend)")); @@ -362,7 +362,7 @@ struct DeviceIndependenceTensorOperations { int x_rank = x.dims().size(); std::vector out_shape; if (x_rank == 2) { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Current diag only support vector" "-> diagonalized matrix, not support matrix -> vector," " Use DiagOp instead.")); @@ -371,7 +371,7 @@ struct DeviceIndependenceTensorOperations { out_shape.push_back(x.dims()[0]); } else { PADDLE_THROW( - platform::errors::InvalidArgument("Rank must less or equal than 2")); + phi::errors::InvalidArgument("Rank must less or equal than 2")); } ret = Fill({out_shape[0], out_shape[0]}, 0.0); T* output = ret.mutable_data(context.GetPlace()); @@ -540,11 +540,11 @@ struct DeviceIndependenceTensorOperations { PADDLE_ENFORCE_EQ( axes.size(), starts.size(), - platform::errors::InvalidArgument("Slice Operator Argument Invalided")); + phi::errors::InvalidArgument("Slice Operator Argument Invalided")); PADDLE_ENFORCE_EQ( ends.size(), starts.size(), - platform::errors::InvalidArgument("Slice Operator Argument Invalided")); + phi::errors::InvalidArgument("Slice Operator Argument Invalided")); for (unsigned int i = 0; i < axes.size(); ++i) { int axis = axes[i]; if (axis < 0) axis = rank + axis; @@ -553,7 +553,7 @@ struct DeviceIndependenceTensorOperations { int ed = ends[i]; PADDLE_ENFORCE_GT(ed, st, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "C++ Slice Operation Not Support End < Start")); out_shape[axis] = ed - st; } @@ -576,7 +576,7 @@ struct DeviceIndependenceTensorOperations { DITO_SLICE_RANK_CASE(5); DITO_SLICE_RANK_CASE(6); default: { - 
PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Invalid Rank number, " "currently only support rank between 2~6")); } @@ -584,22 +584,6 @@ struct DeviceIndependenceTensorOperations { return ret; } - phi::DenseTensor TrilTriu(const phi::DenseTensor& x, - int diagonal, - bool lower) { - framework::AttributeMap attrs; - attrs["diagonal"] = diagonal; - attrs["lower"] = lower; - NameInTensorMap inputs({{"X", {&x}}}); - int x_rank = x.dims().size(); - PADDLE_ENFORCE_GE( - x_rank, - 2, - platform::errors::InvalidArgument("Rank must be at least 2.")); - std::vector out_shape = common::vectorize(x.dims()); - return CreateOpRunAndReturnTensor("tril_triu", inputs, attrs, out_shape); - } - phi::DenseTensor TriangularSolve(const phi::DenseTensor& x, const phi::DenseTensor& y, bool upper, @@ -714,12 +698,12 @@ struct DeviceIndependenceTensorOperations { size_t rank = in->dims().size(); PADDLE_ENFORCE_EQ(start.size(), rank, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "EigenSliceWrapper function start " "argument must have the same length as input rank.")); PADDLE_ENFORCE_EQ(end.size(), rank, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "EigenSliceWrapper function end " "argument must have the same length as input rank.")); auto eigen_place_ptr = @@ -732,7 +716,7 @@ struct DeviceIndependenceTensorOperations { offsets_32bit[i] = start[i]; extents_32bit[i] = end[i]; } - EigenSlice, T, D>::Eval( + phi::funcs::EigenSlice, T, D>::Eval( eigen_place, framework::To32BitIndex(out_t), framework::To32BitIndex(in_t), diff --git a/paddle/fluid/operators/tdm_child_op.cc b/paddle/fluid/operators/tdm_child_op.cc index 7b9932ffb4a62..e14dc0e316219 100644 --- a/paddle/fluid/operators/tdm_child_op.cc +++ b/paddle/fluid/operators/tdm_child_op.cc @@ -60,18 +60,18 @@ class TDMChildOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Inputs(X) of TdmChild should not be null.")); PADDLE_ENFORCE_EQ(ctx->HasInput("TreeInfo"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Inputs(TreeInfo) of TdmChild should not be null.")); int child_nums = ctx->Attrs().Get("child_nums"); PADDLE_ENFORCE_GT( child_nums, 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ValueError: The value of the 'child_nums' must greater than 0. " "But received child_nums value = %d, ", child_nums)); @@ -82,7 +82,7 @@ class TDMChildOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( info_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeError: The dimensions of the 'tree info' must be 2. " "But received tree info's dimensions = %d, " "tree info's shape = [%s].", diff --git a/paddle/fluid/operators/tdm_child_op.h b/paddle/fluid/operators/tdm_child_op.h index 2d849e1849348..3380062743047 100644 --- a/paddle/fluid/operators/tdm_child_op.h +++ b/paddle/fluid/operators/tdm_child_op.h @@ -56,7 +56,7 @@ void TDMChildInner(const framework::ExecutionContext &context, PADDLE_ENFORCE_LT( input_data[input_ids], node_nums, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "input id of OP(paddle.incubate.layers.tdm_child) " "expected >= 0 and < %ld, but got %ld. 
Please check input " "value.", @@ -65,7 +65,7 @@ void TDMChildInner(const framework::ExecutionContext &context, PADDLE_ENFORCE_LE( 0, input_data[input_ids], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "input id of OP(paddle.incubate.layers.tdm_child) " "expected >= 0 and < %ld, but got %ld. Please check input " "value.", @@ -119,7 +119,7 @@ class TDMChildKernel : public framework::OpKernel { input_type == framework::proto::VarType::INT64; PADDLE_ENFORCE_EQ(input_type_match, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(X) holds the wrong type, it holds %s, but " "desires to be %s or %s", paddle::framework::DataTypeToString(input_type), @@ -136,7 +136,7 @@ class TDMChildKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( info_type_match, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(TreeInfo) holds the wrong type, it holds %s, but " "desires to be %s or %s", paddle::framework::DataTypeToString(info_type), @@ -156,7 +156,7 @@ class TDMChildKernel : public framework::OpKernel { output_type == framework::proto::VarType::INT64; PADDLE_ENFORCE_EQ(out_type_match, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(Child) & Output(LeafMask) holds the wrong " "type, it holds %s, but " "desires to be %s or %s", diff --git a/paddle/fluid/operators/tdm_sampler_op.cc b/paddle/fluid/operators/tdm_sampler_op.cc index d516af7718365..f7877b8268a04 100644 --- a/paddle/fluid/operators/tdm_sampler_op.cc +++ b/paddle/fluid/operators/tdm_sampler_op.cc @@ -81,15 +81,15 @@ class TDMSamplerOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Inputs(Input) of TdmSampler should not be null.")); PADDLE_ENFORCE_EQ(ctx->HasInput("Travel"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Inputs(Travel) of TdmSampler should not be null.")); PADDLE_ENFORCE_EQ(ctx->HasInput("Layer"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Inputs(Layer) of TdmSampler should not be null.")); auto neg_samples_num_vec = ctx->Attrs().Get>("neg_samples_num_list"); diff --git a/paddle/fluid/operators/tdm_sampler_op.h b/paddle/fluid/operators/tdm_sampler_op.h index 52f86d633307b..7dcc72b66a1a6 100644 --- a/paddle/fluid/operators/tdm_sampler_op.h +++ b/paddle/fluid/operators/tdm_sampler_op.h @@ -93,7 +93,7 @@ void TDMSamplerInner(const framework::ExecutionContext &context, PADDLE_ENFORCE_LT( -1, input_id, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Variable value (input) of OP(fluid.layers.tdm_sampler) " "expected >= 0 and < %ld, but got %ld. Please check input " "value.", @@ -102,7 +102,7 @@ void TDMSamplerInner(const framework::ExecutionContext &context, PADDLE_ENFORCE_LT( input_id, travel_dim[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Variable value (input) of OP(fluid.layers.tdm_sampler) " "expected >= 0 and < %ld, but got %ld. Please check input " "value.", @@ -126,7 +126,7 @@ void TDMSamplerInner(const framework::ExecutionContext &context, PADDLE_ENFORCE_LE( sample_num, node_nums - 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Neg sample nums id of OP(fluid.layers.tdm_sampler) at layer %ld " "expected <= %ld - 1 (positive included), but got %ld. 
Please " "check neg_samples_num_list.", @@ -163,7 +163,7 @@ void TDMSamplerInner(const framework::ExecutionContext &context, PADDLE_ENFORCE_LE( positive_node_id, node_id_max, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Positive node id of OP(fluid.layers.tdm_sampler) at layer %ld " "expected >= %ld and <= %ld, but got %ld. Please check input " "value.", @@ -174,7 +174,7 @@ void TDMSamplerInner(const framework::ExecutionContext &context, PADDLE_ENFORCE_LE( node_id_min, positive_node_id, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Positive node id of OP(fluid.layers.tdm_sampler) at layer %ld " "expected >= %ld and <= %ld, but got %ld. Please check input " "value.", @@ -224,7 +224,7 @@ void TDMSamplerInner(const framework::ExecutionContext &context, PADDLE_ENFORCE_LE( layer_data[layer_offset_lod[layer_idx] + sample_res], node_id_max, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Negative node id of OP(fluid.layers.tdm_sampler) at layer %ld" "expected >= %ld and <= %ld, but got %ld. Please check input " "tdm tree structure and tdm travel info.", @@ -270,7 +270,7 @@ class TDMSamplerKernel : public framework::OpKernel { input_type == framework::proto::VarType::INT64; PADDLE_ENFORCE_EQ(input_type_match, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(X) holds the wrong type, it holds %s, but " "desires to be %s or %s", paddle::framework::DataTypeToString(input_type), @@ -286,7 +286,7 @@ class TDMSamplerKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( travel_type_match, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(Travel) holds the wrong type, it holds %s, but " "desires to be %s or %s", paddle::framework::DataTypeToString(travel_type), @@ -301,7 +301,7 @@ class TDMSamplerKernel : public framework::OpKernel { layer_type == framework::proto::VarType::INT64; PADDLE_ENFORCE_EQ(layer_type_match, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(Layer) holds the wrong type, it holds %s, but " "desires to be %s or %s", paddle::framework::DataTypeToString(layer_type), @@ -312,7 +312,7 @@ class TDMSamplerKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( travel_type, layer_type, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(Travel) must holds the same type with " "Input(Layer), but Travel holds %s, and Layer holds %s", paddle::framework::DataTypeToString(travel_type), diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc deleted file mode 100644 index 332008894d5b9..0000000000000 --- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc +++ /dev/null @@ -1,263 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/teacher_student_sigmoid_loss_op.h" - -#include - -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -class TeacherStudentSigmoidLossOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("X"), "Input", "X", "teacher_student_sigmoid_loss"); - OP_INOUT_CHECK(ctx->HasInput("Label"), - "Input", - "Label", - "teacher_student_sigmoid_loss"); - OP_INOUT_CHECK( - ctx->HasOutput("Y"), "Output", "Y", "teacher_student_sigmoid_loss"); - - auto x_dims = ctx->GetInputDim("X"); - auto label_dims = ctx->GetInputDim("Label"); - PADDLE_ENFORCE_EQ(x_dims.size(), - 2UL, - platform::errors::InvalidArgument( - "Input(X)'s rank should be 2. But received: " - "Input(X)'s rank is [%d]", - x_dims.size())); - PADDLE_ENFORCE_EQ(label_dims.size(), - 2UL, - platform::errors::InvalidArgument( - "Input(Label)'s rank should be 2. But " - "received Input(Label)'s rank is [%d]", - label_dims.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - x_dims[0], - label_dims[0], - platform::errors::InvalidArgument( - "The 1st dimension of Input(X) and Input(Label) should " - "be equal. The difference is [%d]: [%d]", - x_dims[0], - label_dims[0])); - PADDLE_ENFORCE_EQ(label_dims[1], - 1UL, - platform::errors::InvalidArgument( - "The 2nd dimension of " - "Input(Label) should be 1. But received " - "Input(Label)'s 2nd dim is [%d]", - label_dims[1])); - } - ctx->SetOutputDim("Y", {x_dims[0], 1}); - ctx->ShareLoD("X", /*->*/ "Y"); - } - - protected: - // Explicitly set that the data type of computation kernel of - // teacher_student_sigmoid_loss - // is determined by its input "X". - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -template -class TeacherStudentSigmoidLossGradOpMaker - : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("teacher_student_sigmoid_loss_grad"); - - op->SetInput("X", this->Input("X")); - op->SetInput("Label", this->Input("Label")); - op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); - - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - - op->SetAttrMap(this->Attrs()); - } -}; - -class TeacherStudentSigmoidLossGradientOp - : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("X"), "Input", "X", "teacher_student_sigmoid_loss_grad"); - OP_INOUT_CHECK(ctx->HasInput("Label"), - "Input", - "X", - "teacher_student_sigmoid_loss_grad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), - "Input", - "Y@Grad", - "teacher_student_sigmoid_loss_grad"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), - "Input", - "X@Grad", - "teacher_student_sigmoid_loss_grad"); - - auto x_dims = ctx->GetInputDim("X"); - auto label_dims = ctx->GetInputDim("Label"); - auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y")); - PADDLE_ENFORCE_EQ( - x_dims.size(), - 2, - platform::errors::InvalidArgument( - "Input(X)'s rank should be 2. 
But received Input(X)'s rank is [%d]", - x_dims.size())); - PADDLE_ENFORCE_EQ(dy_dims.size(), - 2, - platform::errors::InvalidArgument( - "Input(Y@Grad)'s rank should be 2. But received " - "Input(Y@Grad)'s rank is [%d]", - dy_dims.size())); - PADDLE_ENFORCE_EQ(label_dims.size(), - 2, - platform::errors::InvalidArgument( - "Input(Label)'s rank should be 2. But received " - "Input(Y@Grad)'s rank is [%d]", - label_dims.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - x_dims[0], - label_dims[0], - platform::errors::InvalidArgument( - "The 1st dimension of Input(X) and Input(Label) should " - "be equal. The difference is [%d]: [%d]", - x_dims[0], - label_dims[0])); - PADDLE_ENFORCE_EQ( - x_dims[0], - dy_dims[0], - platform::errors::InvalidArgument( - "The 1st dimension of Input(X) and Input(Y@Grad) should " - "be equal. The difference is [%d]: [%d]", - x_dims[0], - dy_dims[0])); - PADDLE_ENFORCE_EQ( - dy_dims[1], - 1, - platform::errors::InvalidArgument( - "The 2nd dimension of Input(Y@Grad) should be 1. " - "But received Input(Y@Grad)'s 2nd dimension is [%d]", - dy_dims[1])); - PADDLE_ENFORCE_EQ( - label_dims[1], - 1, - platform::errors::InvalidArgument( - "When Attr(soft_label) == false, the 2nd dimension of " - "Input(Label) should be 1. But received Input(Label)'s 2nd " - "dimension " - "is [%d]", - label_dims[1])); - } - ctx->SetOutputDim(framework::GradVarName("X"), x_dims); - ctx->ShareLoD("X", framework::GradVarName("X")); - } - - protected: - // Explicitly set that the data type of computation kernel of - // teacher_student_sigmoid_loss - // is determined by its input "X". - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class TeacherStudentSigmoidLossOpMaker - : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(phi::DenseTensor, default phi::DenseTensor<float>), a 2-D " - "tensor with shape [N x 1]," - " where N is the batch size and D is the output. " - "This input is a probability computed by the previous operator, " - "which is almost always the result of a softmax operator."); - AddInput("Label", - "(phi::DenseTensor), the ground truth which is a 2-D tensor. " - "Label is a phi::DenseTensor with shape [N x 1]. "); - AddOutput("Y", - "(phi::DenseTensor, default phi::DenseTensor<float>), a 2-D " - "tensor with shape " - "[N x 1]. The teacher student sigmoid loss."); - AddAttr<float>( - "soft_max_up_bound", - "fp32, if input > soft_max_up_bound, input will be bound, default 15.0") - .SetDefault(15.0); - AddAttr<float>("soft_max_lower_bound", - "fp32, if input < soft_max_lower_bound, input will be " - "bound, default -15.0") - .SetDefault(-15.0); - AddComment(R"DOC( -TeacherStudentSigmoidLoss Operator. - -It's similarity to SigmoidCrossEntropyWithLogits Operator. The difference is that -we add another label(z') to original. 
- loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + log(1 + exp(-abs(x))) - z is click or not - z' is teacher value - label = {-2, -1, [0, 2]} - when z' is not exist, clk = 0 : label = -2; - when z' is not exist, clk = 1 : label = -1; - when z' is exist , clk = 0 : label = 0 + z'; - when z' is exist , clk = 1 : label = 1 + z'; - -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - teacher_student_sigmoid_loss, - ops::TeacherStudentSigmoidLossOp, - ops::TeacherStudentSigmoidLossOpMaker, - ops::TeacherStudentSigmoidLossGradOpMaker<paddle::framework::OpDesc>, - ops::TeacherStudentSigmoidLossGradOpMaker<paddle::imperative::OpBase>); - -REGISTER_OPERATOR(teacher_student_sigmoid_loss_grad, - ops::TeacherStudentSigmoidLossGradientOp); - -PD_REGISTER_STRUCT_KERNEL(teacher_student_sigmoid_loss, - CPU, - ALL_LAYOUT, - ops::TeacherStudentSigmoidLossOpKernel, - float, - double) {} -PD_REGISTER_STRUCT_KERNEL(teacher_student_sigmoid_loss_grad, - CPU, - ALL_LAYOUT, - ops::TeacherStudentSigmoidLossGradOpKernel, - float, - double) {} diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h deleted file mode 100644 index 7ccb9438d4188..0000000000000 --- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h +++ /dev/null @@ -1,118 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template <typename T, typename DeviceContext> -class TeacherStudentSigmoidLossOpKernel : public framework::OpKernel<T> { - public: - void Compute(const framework::ExecutionContext& context) const override { - phi::DenseTensor* y = context.Output<phi::DenseTensor>("Y"); - const phi::DenseTensor* x = context.Input<phi::DenseTensor>("X"); - const phi::DenseTensor* labels = context.Input<phi::DenseTensor>("Label"); - T* y_data = y->mutable_data<T>(context.GetPlace()); - const T* x_data = x->data<T>(); - const T* label_data = labels->data<T>(); - int64_t batch_size = x->dims()[0]; - // loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + - // log(1 + exp(-abs(x))) - // z is click or not - // z' is value q of feed_fine - // label = {-2, -1, [0, 2]} - // when z' is not exist, clk = 0 : label = -2; - // when z' is not exist, clk = 1 : label = -1; - // when z' is exist , clk = 0 : label = 0 + z'; - // when z' is exist , clk = 1 : label = 1 + z'; - for (int i = 0; i < batch_size; ++i) { - if (label_data[i] < -1.0) { - y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) + - log(1.0 + exp(-fabs(x_data[i]))); - } else if (label_data[i] < 0.0) { - y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) - x_data[i] + - log(1.0 + exp(-fabs(x_data[i]))); - } else if (label_data[i] < 1.0) { - y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) + - log(1.0 + exp(-fabs(x_data[i]))) + - (x_data[i] > 0 ? x_data[i] : 0.0) - - x_data[i] * label_data[i] + - log(1.0 + exp(-fabs(x_data[i]))); - } else { - y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) - x_data[i] + - log(1.0 + exp(-fabs(x_data[i]))) + - (x_data[i] > 0 ? x_data[i] : 0.0) - - x_data[i] * (label_data[i] - 1.0) + - log(1.0 + exp(-fabs(x_data[i]))); - } - } - } -};
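For reference, the piecewise loss evaluated by the deleted forward kernel above, as a minimal standalone sketch that mirrors its label encoding (illustrative helper names; not part of the patch):

#include <cmath>

// bce(x, z) = max(x, 0) - x * z + log(1 + exp(-|x|)), the numerically
// stable sigmoid cross-entropy term used twice by the kernel above.
double bce(double x, double z) {
  return (x > 0 ? x : 0.0) - x * z + std::log(1.0 + std::exp(-std::fabs(x)));
}

double teacher_student_loss(double x, double label) {
  if (label < -1.0) return bce(x, 0.0);                 // no teacher, clk = 0
  if (label < 0.0) return bce(x, 1.0);                  // no teacher, clk = 1
  if (label < 1.0) return bce(x, 0.0) + bce(x, label);  // teacher, clk = 0, z' = label
  return bce(x, 1.0) + bce(x, label - 1.0);             // teacher, clk = 1, z' = label - 1
}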
-template <typename T, typename DeviceContext> -class TeacherStudentSigmoidLossGradOpKernel : public framework::OpKernel<T> { - public: - void Compute(const framework::ExecutionContext& context) const override { - const phi::DenseTensor* x = context.Input<phi::DenseTensor>("X"); - const T* x_data = x->data<T>(); - - phi::DenseTensor* dx = - context.Output<phi::DenseTensor>(framework::GradVarName("X")); - T* dx_data = dx->mutable_data<T>(context.GetPlace()); - - const phi::DenseTensor* labels = context.Input<phi::DenseTensor>("Label"); - const T* label_data = labels->data<T>(); - - T soft_max_up_bound = - static_cast<T>(context.Attr<float>("soft_max_up_bound")); - T soft_max_lower_bound = - static_cast<T>(context.Attr<float>("soft_max_lower_bound")); - - int64_t batch_size = x->dims()[0]; - - const phi::DenseTensor* dOut = - context.Input<phi::DenseTensor>(framework::GradVarName("Y")); - - const T* dout_data = dOut->data<T>(); - - for (int i = 0; i < batch_size; ++i) { - T sum_val = x_data[i]; - if (sum_val > soft_max_up_bound) { - sum_val = soft_max_up_bound; - } else { - if (sum_val < soft_max_lower_bound) { - sum_val = soft_max_lower_bound; - } - } - - T pred = 1.0 / (1.0 + exp(-sum_val)); - if (label_data[i] < -1.0) { - dx_data[i] = 0.0 - pred; - } else if (label_data[i] < 0.0) { - dx_data[i] = 1.0 - pred; - } else { - dx_data[i] = label_data[i] - 2.0 * pred; - } - if (sum_val >= soft_max_up_bound || sum_val <= soft_max_lower_bound) { - dx_data[i] = 0; - } - dx_data[i] *= dout_data[i] * -1; - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/temporal_shift_op.cu b/paddle/fluid/operators/temporal_shift_op.cu index 51b75832d078a..d03f93e0503ae 100644 --- a/paddle/fluid/operators/temporal_shift_op.cu +++ b/paddle/fluid/operators/temporal_shift_op.cu @@ -156,10 +156,10 @@ template <typename T> class TemporalShiftOpCUDAKernel : public framework::OpKernel<T> { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), - true, - platform::errors::InvalidArgument( - "This kernel only runs on GPU device.")); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), + true, + phi::errors::InvalidArgument("This kernel only runs on GPU device.")); auto* input = ctx.Input<phi::DenseTensor>("X"); auto* output = ctx.Output<phi::DenseTensor>("Out"); int t = ctx.Attr<int>("seg_num"); @@ -275,11 +275,11 @@ PD_REGISTER_STRUCT_KERNEL(temporal_shift, ops::TemporalShiftOpCUDAKernel, float, double, - plat::float16) {} + phi::dtype::float16) {} PD_REGISTER_STRUCT_KERNEL(temporal_shift_grad, GPU, ALL_LAYOUT, ops::TemporalShiftGradOpCUDAKernel, float, double, - plat::float16) {} + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/tensor_array_to_tensor_op.cc b/paddle/fluid/operators/tensor_array_to_tensor_op.cc index 69c7446d85d47..f7b5a9a8833d2 100644 --- a/paddle/fluid/operators/tensor_array_to_tensor_op.cc +++ b/paddle/fluid/operators/tensor_array_to_tensor_op.cc @@ -93,9 +93,9 @@ class LoDTensorArray2TensorOp : public framework::OperatorBase { PADDLE_ENFORCE_GT( n, 0, - platform::errors::InvalidArgument("Input tensorarray size should > 0," - "but the received is %d", - n)); + phi::errors::InvalidArgument("Input tensorarray size should > 0," + "but the received is %d", + n)); std::string base_name = Inputs("X")[0]; std::vector<std::string> names; @@ -229,9 +229,9 @@ class LoDTensorArray2TensorGradOp : public framework::OperatorBase { PADDLE_ENFORCE_GT( n, 0, - 
platform::errors::InvalidArgument("Input tensorarray size should > 0, " - "but the received is: %d. ", - n)); + phi::errors::InvalidArgument("Input tensorarray size should > 0, " + "but the received is: %d. ", + n)); std::string base_name = Inputs("X")[0]; std::vector names; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 28dcaf3d43e31..2709d404320bb 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -75,7 +75,7 @@ static void RuntimeStaticShapeCheck(std::vector runtime_input_shape, PADDLE_ENFORCE_EQ( model_input_shape == runtime_input_shape, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input shapes are inconsistent with the model. Expect [%s] in " "model description, but got [%s] in runtime. TRT 5 " "or lower version " @@ -101,7 +101,7 @@ static phi::DataType TRT2FluidDataType(nvinfer1::DataType type) { return phi::DataType::BOOL; #endif default: - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "unknown fluid datatype in Fluid op converter")); return phi::DataType::FLOAT32; } @@ -114,7 +114,7 @@ static void RuntimeDynamicShapeCheck( const std::vector &max_input_shape) { // PADDLE_ENFORCE_EQ( // runtime_input_shape.size(), min_input_shape.size(), - // platform::errors::InvalidArgument( + // phi::errors::InvalidArgument( // "TRT engine runtime input %s dims size(%d) inconsistent " // "with the dynamic shape size(%d)", // x, runtime_input_shape.size(), min_input_shape.size())); @@ -139,7 +139,7 @@ static void RuntimeDynamicShapeCheck( PADDLE_ENFORCE_EQ(is_input_shape_valid( runtime_input_shape, min_input_shape, max_input_shape), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "TRT runtime input shape of %s is invalid. Expect " "runtime input shape to be within min/max input shape " "configured in SetTRTDynamicShapeInfo()," @@ -362,12 +362,12 @@ class TensorRTEngineOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ( min_input_shape.count(x), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input %s not found in TRT engine min_input_shape.", x)); PADDLE_ENFORCE_EQ( max_input_shape.count(x), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input %s not found in TRT engine max_input_shape.", x)); RuntimeDynamicShapeCheck(x, runtime_input_shape[x], @@ -560,7 +560,7 @@ class TensorRTEngineOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ( t.numel(), 1UL, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "This tensor must have one element, but got %ld.", t.numel())); t_shape.push_back(1); } @@ -571,7 +571,7 @@ class TensorRTEngineOp : public framework::OperatorBase { PADDLE_ENFORCE_LT( bind_index, num_bindings, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Wrong TRT engine input binding index. Expected The " "binding index of TRT engine input to be less than " "the number of inputs and outputs. Received binding " @@ -592,7 +592,7 @@ class TensorRTEngineOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ( runtime_batch, t_shape[0], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Inputs of trt subgraphs has different batchsize. " "It's not allowed in static shape mode. 
" "Check whether the model you are running has multiple trt " @@ -693,7 +693,7 @@ class TensorRTEngineOp : public framework::OperatorBase { auto intrt_type = engine->engine()->getBindingDataType(intrt_index); PADDLE_ENFORCE_EQ(indata_type, intrt_type, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The TRT Engine OP's input type [%d] should equal " "to the input data type [%d].", static_cast(intrt_type), @@ -733,7 +733,7 @@ class TensorRTEngineOp : public framework::OperatorBase { buffers[bind_index] = static_cast(t.data()); #endif } else { - PADDLE_THROW(platform::errors::Fatal( + PADDLE_THROW(phi::errors::Fatal( "The TRT Engine OP only support " "float/double/int32_t/int64_t/float16/bool input.")); } @@ -787,14 +787,14 @@ class TensorRTEngineOp : public framework::OperatorBase { auto *fluid_v = scope.FindVar(y); PADDLE_ENFORCE_NOT_NULL( fluid_v, - platform::errors::NotFound( + phi::errors::NotFound( "Output variable %s is not found in TensorRT subgraph.", y)); auto *fluid_t = fluid_v->GetMutable(); fluid_t->Resize(common::make_ddim(ddim)); PADDLE_ENFORCE_LT(bind_index, num_bindings, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The binding index in TRT engine should be less " "than the number of bindings, but got binding " "index = %d, number of bindings = %d.", @@ -813,7 +813,7 @@ class TensorRTEngineOp : public framework::OperatorBase { PADDLE_ENFORCE_LE( runtime_batch, max_batch_size_, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The runtime batch size (%d) is greater than the max batch " "size(%d).\n" "There are two possible causes for this problem: \n" @@ -983,7 +983,7 @@ class TensorRTEngineOp : public framework::OperatorBase { } PADDLE_ENFORCE_NOT_NULL( trt_engine_, - platform::errors::Fatal( + phi::errors::Fatal( "The pointer to tensorrt engine should not be null.")); return trt_engine_; } diff --git a/paddle/fluid/operators/tile_op.cc b/paddle/fluid/operators/tile_op.cc index 9d961bbd57122..1819b587c7c1c 100644 --- a/paddle/fluid/operators/tile_op.cc +++ b/paddle/fluid/operators/tile_op.cc @@ -181,7 +181,7 @@ class TileCompositeGradOpMaker : public prim::CompositeGradOpMakerBase { auto repeat_times = this->Attr>("repeat_times"); if (tensor_repeat_times.is_initialized() || tensor_repeat_times_attr.is_initialized()) { - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(phi::errors::Unimplemented( "We don't support RepeatTimes from tensor or repeat_times_tensor for " "tile composite grad for now. 
")); } else { diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc index 32ee384f841d6..99b311960e77b 100644 --- a/paddle/fluid/operators/top_k_op.cc +++ b/paddle/fluid/operators/top_k_op.cc @@ -24,17 +24,17 @@ class TopkOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), - true, - platform::errors::InvalidArgument( - "Input(X) of TopkOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("X"), + true, + phi::errors::InvalidArgument("Input(X) of TopkOp should not be null.")); PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(Out) of TopkOp should not be null.")); PADDLE_ENFORCE_EQ(ctx->HasOutput("Indices"), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output(Indices) of TopkOp should not be null.")); auto input_dims = ctx->GetInputDim("X"); @@ -42,18 +42,18 @@ class TopkOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE(k, 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Attribute k must be >= 1, but got k is %d.", k)); PADDLE_ENFORCE_GE( input_dims.size(), 1, - platform::errors::InvalidArgument("input must have >= 1d shape")); + phi::errors::InvalidArgument("input must have >= 1d shape")); if (ctx->IsRuntime()) { PADDLE_ENFORCE_GE( input_dims[input_dims.size() - 1], k, - platform::errors::InvalidArgument("input must have >= k columns")); + phi::errors::InvalidArgument("input must have >= k columns")); } framework::DDim dims = input_dims; @@ -104,19 +104,19 @@ class TopkOpGrad : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->HasInput("X"), true, - platform::errors::InvalidArgument("Input(X) should be not null")); + phi::errors::InvalidArgument("Input(X) should be not null")); PADDLE_ENFORCE_EQ( ctx->HasInput("Indices"), true, - platform::errors::InvalidArgument("Input(Indices) should be not null")); - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), - true, - platform::errors::InvalidArgument( - "Grad Input(Out) should be not null")); + phi::errors::InvalidArgument("Input(Indices) should be not null")); + PADDLE_ENFORCE_EQ( + ctx->HasInput(framework::GradVarName("Out")), + true, + phi::errors::InvalidArgument("Grad Input(Out) should be not null")); PADDLE_ENFORCE_EQ( ctx->HasOutput(framework::GradVarName("X")), true, - platform::errors::InvalidArgument("Grad Output(X) should be not null")); + phi::errors::InvalidArgument("Grad Output(X) should be not null")); auto x_dims = ctx->GetInputDim("X"); ctx->SetOutputDim(framework::GradVarName("X"), x_dims); diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index 003f670133e45..1bb53891d8151 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -23,7 +23,7 @@ limitations under the License. 
*/ #endif #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/top_k_op.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/kernels/funcs/top_k_function_cuda.h" // set cub base traits in order to handle float16 @@ -61,10 +61,9 @@ template class TopkOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), - true, - platform::errors::InvalidArgument("It must use CUDAPlace.")); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), + true, + phi::errors::InvalidArgument("It must use CUDAPlace.")); auto* input = ctx.Input("X"); auto* output = ctx.Output("Out"); auto* indices = ctx.Output("Indices"); @@ -124,12 +123,12 @@ class TopkOpCUDAKernel : public framework::OpKernel { gridx, input_height)); default: - PADDLE_THROW(platform::errors::Fatal( + PADDLE_THROW(phi::errors::Fatal( "the input k has error when use getMaxLength function to get the " "maxLength.")); }); default: - PADDLE_THROW(platform::errors::Unavailable( + PADDLE_THROW(phi::errors::Unavailable( "Calculation error occurred in TopK Operator's CUDA Kernel.")); } } @@ -139,10 +138,9 @@ template class TopkOpGradCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), - true, - platform::errors::InvalidArgument("It must use CUDAPlace.")); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(context.GetPlace()), + true, + phi::errors::InvalidArgument("It must use CUDAPlace.")); auto* x = context.Input("X"); auto* out_grad = context.Input(framework::GradVarName("Out")); @@ -169,7 +167,7 @@ class TopkOpGradCUDAKernel : public framework::OpKernel { x_grad_data, indices_data, out_grad_data, row, col, k)); default: PADDLE_THROW( - platform::errors::Unavailable("Error occurs when Assign Grad.")); + phi::errors::Unavailable("Error occurs when Assign Grad.")); } } }; @@ -184,8 +182,7 @@ REGISTER_OP_CUDA_KERNEL( paddle::operators::TopkOpCUDAKernel, paddle::operators::TopkOpCUDAKernel, paddle::operators::TopkOpCUDAKernel, - paddle::operators::TopkOpCUDAKernel); + paddle::operators::TopkOpCUDAKernel); REGISTER_OP_CUDA_KERNEL( top_k_grad, @@ -194,4 +191,4 @@ REGISTER_OP_CUDA_KERNEL( paddle::operators::TopkOpGradCUDAKernel, paddle::operators::TopkOpGradCUDAKernel, paddle::operators::TopkOpGradCUDAKernel); + phi::dtype::float16>); diff --git a/paddle/fluid/operators/top_k_op_xpu.cc b/paddle/fluid/operators/top_k_op_xpu.cc index fff713236e9a6..48902ed3d8bd5 100644 --- a/paddle/fluid/operators/top_k_op_xpu.cc +++ b/paddle/fluid/operators/top_k_op_xpu.cc @@ -92,5 +92,5 @@ class TopkXPUKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL(top_k, ops::TopkXPUKernel, - ops::TopkXPUKernel); + ops::TopkXPUKernel); #endif diff --git a/paddle/fluid/operators/transfer_layout_op.cc b/paddle/fluid/operators/transfer_layout_op.cc index 9df0a1f3e36ed..a23461475397b 100644 --- a/paddle/fluid/operators/transfer_layout_op.cc +++ b/paddle/fluid/operators/transfer_layout_op.cc @@ -52,7 +52,7 @@ class TransferLayoutOp : public framework::OperatorWithKernel { if (in_tensor->layout() != DataLayout::ONEDNN) { PADDLE_ENFORCE_EQ(in_tensor->IsInitialized(), true, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The tensor of Input(X) is not initialized.")); } auto place = 
diff --git a/paddle/fluid/operators/transfer_layout_op.h b/paddle/fluid/operators/transfer_layout_op.h index 2736171626121..1b4ef2d1b5abb 100644 --- a/paddle/fluid/operators/transfer_layout_op.h +++ b/paddle/fluid/operators/transfer_layout_op.h @@ -74,7 +74,7 @@ class TransferLayoutFunctor { PADDLE_ENFORCE_NE( in_layout, out_layout, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "No layout transform needed between two oneDNN OPKernels.")); if (in_layout != DataLayout::ONEDNN && out_layout == DataLayout::ONEDNN) { @@ -136,7 +136,7 @@ class TransferLayoutFunctor { PADDLE_ENFORCE_EQ( common::arity(in.dims()), 4, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input dimension arity only can be 4, the input dimension is %s.", in.dims())); diff --git a/paddle/fluid/operators/unbind_op.h b/paddle/fluid/operators/unbind_op.h index 7a5bf4d34c47c..ea2c6d4ee2bb8 100644 --- a/paddle/fluid/operators/unbind_op.h +++ b/paddle/fluid/operators/unbind_op.h @@ -21,7 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/operators/utils.h" +#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/strided_memcpy.h" namespace paddle { diff --git a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc index 6b84fd1612e65..bcff52e1af6d7 100644 --- a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc +++ b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc @@ -38,7 +38,7 @@ inline void UniformRealDistribution(T *data, } template <> -inline void UniformRealDistribution(paddle::platform::bfloat16 *data, +inline void UniformRealDistribution(phi::dtype::bfloat16 *data, const int64_t &size, const float &min, const float &max, @@ -48,7 +48,7 @@ inline void UniformRealDistribution(paddle::platform::bfloat16 *data, auto engine = phi::GetCPURandomEngine(seed); for (int64_t i = 0; i < size; ++i) { - data[i] = static_cast(dist(*engine)); + data[i] = static_cast(dist(*engine)); } } } // namespace @@ -85,7 +85,7 @@ class CPUUniformRandomKernel : public framework::OpKernel { tensor = out_var->GetMutable(); if (!new_shape.empty()) tensor->Resize(common::make_ddim(new_shape)); } else { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Expected type of Output(out) in uniform_random_op must be Tensor, " "SelectedRows. 
But got " "unsupport type: %s.", @@ -110,7 +110,7 @@ class CPUUniformRandomKernel : public framework::OpKernel { PADDLE_ENFORCE_GT( size, (diag_num - 1) * (diag_step + 1), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeInvalid: the diagonal's elements is equal (num-1) " "* (step-1) with num %d, step %d," "It should be smaller than %d, but received %d", diff --git a/paddle/fluid/operators/uniform_random_batch_size_like_op.cu b/paddle/fluid/operators/uniform_random_batch_size_like_op.cu index 0cf50142c04a0..0b81c690d573f 100644 --- a/paddle/fluid/operators/uniform_random_batch_size_like_op.cu +++ b/paddle/fluid/operators/uniform_random_batch_size_like_op.cu @@ -45,7 +45,7 @@ class GPUUniformRandomKernel : public framework::OpKernel { tensor = out_var->GetMutable(); if (!new_shape.empty()) tensor->Resize(common::make_ddim(new_shape)); } else { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Expected type of Output(out) in uniform_random_op must be " "phi::DenseTensor, " "SelectedRows. But got " diff --git a/paddle/fluid/operators/uniform_random_inplace_op_xpu.cc b/paddle/fluid/operators/uniform_random_inplace_op_xpu.cc index f1afd8ef3e213..6a773c60997ea 100644 --- a/paddle/fluid/operators/uniform_random_inplace_op_xpu.cc +++ b/paddle/fluid/operators/uniform_random_inplace_op_xpu.cc @@ -50,7 +50,7 @@ class XPUUniformRandomInplaceKernel : public framework::OpKernel { PADDLE_ENFORCE_GT( size, (diag_num - 1) * (diag_step + 1), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "ShapeInvalid: the diagonal's elements is equal (num-1) " "* (step-1) with num %d, step %d," "It should be smaller than %d, but received %d", diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index 458794223dc74..2dbab83a2f528 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -60,7 +60,7 @@ inline std::vector GetNewDataFromShapeTensor( } return vec_new_data; } else { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Expected dtype of ShapeTensor must be int32, int64. But got " "unsupport dtype: %s.", new_data_tensor->dtype())); @@ -76,7 +76,7 @@ inline std::vector GetNewDataFromShapeTensorList( PADDLE_ENFORCE_EQ( tensor->dims(), common::make_ddim({1}), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Shape of dim tensor in uniform_random_op should be [1]" "But received tensor's dim=%s.", tensor->dims())); @@ -100,7 +100,7 @@ inline std::vector GetNewDataFromShapeTensorList( vec_new_shape.push_back(*tensor->data()); } } else { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Expected dtype of ShapeTensorList of %d-th must be int32, int64. 
" "But got " "unsupport dtype: %s.", diff --git a/paddle/fluid/operators/unique_op.h b/paddle/fluid/operators/unique_op.h index 4d7a9eb5e4937..47bd4674c9a29 100644 --- a/paddle/fluid/operators/unique_op.h +++ b/paddle/fluid/operators/unique_op.h @@ -55,7 +55,7 @@ struct UniqueOpFunctor { PADDLE_ENFORCE_LT( in_->numel(), pow(2, 31), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The num of Input(X) elements should be less then INT_MAX, " "but received num is %d.", in_->numel())); @@ -84,7 +84,7 @@ struct UniqueOpFunctor { index_type == framework::proto::VarType::INT64; PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Index holds the wrong type, it holds %s, " "but desires to be %s or %s", paddle::framework::DataTypeToString(index_type), @@ -406,7 +406,7 @@ class UniqueKernel : public framework::OpKernel { PADDLE_ENFORCE_LE( x->numel(), INT_MAX, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of elements in Input(X) should be less than or " "equal to INT_MAX, but received num is %d. Please set `dtype` to " "int64.", diff --git a/paddle/fluid/operators/unique_with_counts_op.cc b/paddle/fluid/operators/unique_with_counts_op.cc deleted file mode 100644 index 5272158805d71..0000000000000 --- a/paddle/fluid/operators/unique_with_counts_op.cc +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/unique_with_counts_op.h" - -namespace paddle { -namespace operators { - -class UniqueWithCountsOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "unique_with_counts"); - OP_INOUT_CHECK( - ctx->HasOutput("Out"), "Output", "Out", "unique_with_counts"); - OP_INOUT_CHECK( - ctx->HasOutput("Index"), "Output", "Index", "unique_with_counts"); - OP_INOUT_CHECK( - ctx->HasOutput("Count"), "Output", "Count", "unique_with_counts"); - - auto in_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - in_dims.size(), - 1, - platform::errors::InvalidArgument("The Input(X) should be 1-D Tensor, " - "But now the dims of Input(X) is %d.", - in_dims.size())); - - ctx->SetOutputDim("Out", {-1}); - ctx->SetOutputDim("Index", in_dims); - ctx->SetOutputDim("Count", {-1}); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - platform::CPUPlace()); - } -}; - -class UniqueWithCountsOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "Input tensor. 
It should be a 1-D tensor."); - AddAttr("dtype", "data type for output index"); - AddOutput("Out", "A unique subsequence for input tensor."); - AddOutput("Index", - "An index tensor pointing to unique subsequence, which has " - "identical shape with input tensor and the data type is set by " - "the attr `dtype`"); - AddOutput("Count", "A subsequence for the count of unique index"); - AddComment(R"DOC( - Return a unique subsequence for 1-D input tensor, index tensor pointing to this unique subsequence, - and the subsequence for the count of unique index. -)DOC"); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(unique_with_counts, - ops::UniqueWithCountsOp, - ops::UniqueWithCountsOpMaker); -PD_REGISTER_STRUCT_KERNEL(unique_with_counts, - CPU, - ALL_LAYOUT, - ops::UniqueWithCountsKernel, - float, - double, - int32_t, - int64_t) {} diff --git a/paddle/fluid/operators/unique_with_counts_op.h b/paddle/fluid/operators/unique_with_counts_op.h deleted file mode 100644 index 4b1fef5e22447..0000000000000 --- a/paddle/fluid/operators/unique_with_counts_op.h +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/unique_op.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class UniqueWithCountsKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto data_type = static_cast( - context.Attr("dtype")); - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - auto* index = context.Output("Index"); - auto* count = context.Output("Count"); - framework::VisitDataType(data_type, - UniqueOpFunctor(out, index, x, count)); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index 07136f7bd4f31..b19533b005a94 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -29,22 +29,22 @@ register_unity_group( bmm_op.cc bpr_loss_op.cc cast_op.cc - mkldnn/cast_mkldnn_op.cc + onednn/cast_onednn_op.cc cholesky_op.cc chunk_eval_op.cc clip_by_norm_op.cc clip_op.cc coalesce_tensor_op.cc - mkldnn/activation_mkldnn_op.cc - mkldnn/interpolate_mkldnn_op.cc - mkldnn/pool_mkldnn_op.cc - mkldnn/softmax_mkldnn_op.cc) + onednn/activation_onednn_op.cc + onednn/interpolate_onednn_op.cc + onednn/pool_onednn_op.cc + onednn/softmax_onednn_op.cc) register_unity_group( cc center_loss_op.cc - mkldnn/concat_mkldnn_op.cc - mkldnn/conv_mkldnn_op.cc - mkldnn/conv_transpose_mkldnn_op.cc + onednn/concat_onednn_op.cc + onednn/conv_onednn_op.cc + onednn/conv_transpose_onednn_op.cc correlation_op.cc cos_sim_op.cc crf_decoding_op.cc @@ -69,7 +69,7 @@ 
register_unity_group( delete_var_op.cc dequantize_abs_max_op.cc dequantize_op.cc - mkldnn/dequantize_mkldnn_op.cc) + onednn/dequantize_onednn_op.cc) register_unity_group( cc dequeue_op.cc @@ -92,7 +92,7 @@ register_unity_group( expand_v2_op.cc fake_dequantize_op.cc fc_op.cc - mkldnn/fc_mkldnn_op.cc + onednn/fc_onednn_op.cc fill_any_like_op.cc fill_constant_batch_size_like_op.cc fill_constant_op.cc @@ -105,7 +105,7 @@ register_unity_group( gather_nd_op.cc gather_tree_op.cc gaussian_random_batch_size_like_op.cc - mkldnn/gaussian_random_mkldnn_op.cc + onednn/gaussian_random_onednn_op.cc group_norm_op.cc gru_op.cc) register_unity_group( @@ -143,7 +143,7 @@ register_unity_group( log_softmax_op.cc lookup_table_dequant_op.cc lrn_op.cc - mkldnn/lrn_mkldnn_op.cc + onednn/lrn_onednn_op.cc lstm_unit_op.cc) register_unity_group( cc @@ -152,10 +152,9 @@ register_unity_group( masked_select_op.cc match_matrix_tensor_op.cc matmul_op.cc - mkldnn/matmul_mkldnn_op.cc + onednn/matmul_onednn_op.cc max_sequence_len_op.cc maxout_op.cc - merge_lod_tensor_op.cc merge_selected_rows_op.cc meshgrid_op.cc) register_unity_group( @@ -204,7 +203,7 @@ register_unity_group( cc push_dense_op.cc quantize_op.cc - mkldnn/quantize_mkldnn_op.cc + onednn/quantize_onednn_op.cc queue_generator_op.cc range_op.cc rank_attention_op.cc @@ -212,7 +211,7 @@ register_unity_group( recurrent_op.cc reorder_lod_tensor_by_rank_op.cc requantize_op.cc - mkldnn/requantize_mkldnn_op.cc + onednn/requantize_onednn_op.cc reshape_op.cc reverse_op.cc) register_unity_group( @@ -224,7 +223,7 @@ register_unity_group( save_combine_op.cc save_op.cc scale_op.cc - mkldnn/scale_mkldnn_op.cc + onednn/scale_onednn_op.cc scatter_nd_add_op.cc scatter_op.cc seed_op.cc @@ -247,7 +246,6 @@ register_unity_group( register_unity_group( cc spectral_norm_op.cc - split_lod_tensor_op.cc split_op.cc split_selected_rows_op.cc spp_op.cc @@ -256,7 +254,7 @@ register_unity_group( stack_op.cc strided_slice_op.cc sum_op.cc - mkldnn/sum_mkldnn_op.cc + onednn/sum_onednn_op.cc tdm_child_op.cc tdm_sampler_op.cc teacher_student_sigmoid_loss_op.cc @@ -269,7 +267,7 @@ register_unity_group( top_k_v2_op.cc trace_op.cc transpose_op.cc - mkldnn/transpose_mkldnn_op.cc + onednn/transpose_onednn_op.cc unbind_op.cc unfold_op.cc) register_unity_group( diff --git a/paddle/fluid/operators/unzip_op.cc b/paddle/fluid/operators/unzip_op.cc deleted file mode 100644 index b1b3d42282c40..0000000000000 --- a/paddle/fluid/operators/unzip_op.cc +++ /dev/null @@ -1,154 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/unzip_op.h" - -#include - -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -class unzipOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "lod"); - OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "lod"); - auto lod_dims = ctx->GetInputDim("lod"); - PADDLE_ENFORCE_EQ( - lod_dims.size(), - 1UL, - platform::errors::InvalidArgument( - "Input(X)'s rank should be 1, but got %d", lod_dims.size())); - auto len = static_cast(ctx->Attrs().Get("len")); - ctx->SetOutputDim("Y", {lod_dims[0] - 1, len}); - } - - protected: - // Explicitly set that the data type of computation kernel of - // unzip - // is determined by its input "X". - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.device_context().GetPlace()); - } -}; - -class unzipGradientOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "unzipGradient"); - OP_INOUT_CHECK(ctx->HasInput("lod"), "Input", "unzip", "unzipGradient"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), - "Input", - framework::GradVarName("Y"), - "unzipGradient"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), - "Output", - framework::GradVarName("X"), - "unzipGradient"); - - auto x_dims = ctx->GetInputDim("X"); - auto lod_dims = ctx->GetInputDim("lod"); - PADDLE_ENFORCE_EQ( - x_dims.size(), - 2, - platform::errors::InvalidArgument( - "Expect Input(X)'s rank == 2, but got %d", x_dims.size())); - PADDLE_ENFORCE_EQ( - lod_dims.size(), - 1, - platform::errors::InvalidArgument( - "Expect Input(X)'s rank == 1, but got %d", lod_dims.size())); - - ctx->SetOutputDim(framework::GradVarName("X"), x_dims); - ctx->ShareLoD("X", framework::GradVarName("X")); - } - - protected: - // Explicitly set that the data type of computation kernel of - // unzip - // is determined by its input "X". - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Y")), - ctx.device_context().GetPlace()); - } -}; - -class unzipOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(LodTensor, default LodTensor)"); - AddInput("lod", "(Tensor), a 1-D Tensor with shape [K]"); - AddAttr("len", "The len of each original Tensor").SetDefault(1); - AddOutput("Y", - "(LodTensor, default LodTensor), a 2-D tensor with shape " - "[K-1 x len]."); - AddComment(R"DOC( -unzip Operator. 
-)DOC"); - } -}; - -template -class unzipGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("unzip_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("lod", this->Input("lod")); - op->SetAttr("len", this->GetAttr("len")); - op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(unzip, - ops::unzipOp, - ops::unzipOpMaker, - ops::unzipGradOpMaker, - ops::unzipGradOpMaker); - -REGISTER_OPERATOR(unzip_grad, ops::unzipGradientOp); - -PD_REGISTER_STRUCT_KERNEL(unzip, - CPU, - ALL_LAYOUT, - ops::unzipOpKernel, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} -PD_REGISTER_STRUCT_KERNEL(unzip_grad, - CPU, - ALL_LAYOUT, - ops::unzipGradOpKernel, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} diff --git a/paddle/fluid/operators/unzip_op.cu b/paddle/fluid/operators/unzip_op.cu deleted file mode 100644 index 39d80e8c6ce92..0000000000000 --- a/paddle/fluid/operators/unzip_op.cu +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/operators/unzip_op.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/backends/gpu/gpu_primitives.h" -namespace paddle { -namespace operators { - -using phi::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void unzipKernel( - const T* X, const LodType* lod, T* Y, size_t col_size, size_t n) { - CUDA_KERNEL_LOOP(i, n) { - int lod_idx = i / col_size; - int len = lod[lod_idx + 1] - lod[lod_idx]; - if (i >= lod_idx * col_size + len) { - Y[i] = 0; - } else { - Y[i] = X[lod[lod_idx] + i % col_size]; - } - } -} - -template -class unzipCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const auto* x = context.Input("X"); - const T* x_data = x->data(); - - const auto* lod = context.Input("lod"); - const LodType* lod_data = lod->data(); - - auto col_size = context.Attr("len"); - auto row_size = lod->dims()[0] - 1; - auto y_numel = col_size * row_size; - - auto* y = context.Output("Y"); - T* y_data = y->mutable_data(context.GetPlace()); - - // for Input X do not have lod Information. 
- auto stream = context.template device_context().stream(); - unzipKernel<<<(y_numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, - 0, - stream>>>(x_data, lod_data, y_data, col_size, y_numel); - } -}; - -template -class unzipGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_THROW(phi::errors::Unimplemented("unzip_grad is unimplemented")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -PD_REGISTER_STRUCT_KERNEL(unzip, - GPU, - ALL_LAYOUT, - ops::unzipCUDAKernel, - float, - double, - plat::float16, - bool, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} -PD_REGISTER_STRUCT_KERNEL(unzip_grad, - GPU, - ALL_LAYOUT, - ops::unzipGradCUDAKernel, - float, - double, - plat::float16, - bool, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} diff --git a/paddle/fluid/operators/unzip_op.h b/paddle/fluid/operators/unzip_op.h deleted file mode 100644 index 6829d00dccf56..0000000000000 --- a/paddle/fluid/operators/unzip_op.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class unzipOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_THROW(phi::errors::Unimplemented("unzip is unimplemented")); - } -}; - -template -class unzipGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_THROW(phi::errors::Unimplemented("unzip_grad is unimplemented")); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/utils.h b/paddle/fluid/operators/utils.h index cecd2e2931af6..feca2d9c722ac 100644 --- a/paddle/fluid/operators/utils.h +++ b/paddle/fluid/operators/utils.h @@ -29,13 +29,13 @@ inline std::vector GetDataFromTensorList( std::vector vec_new_data; for (size_t i = 0; i < list_tensor.size(); ++i) { auto tensor = list_tensor[i]; - PADDLE_ENFORCE_EQ(tensor->dims(), - common::make_ddim({1}), - platform::errors::InvalidArgument( - "The shape of Tensor in list must be [1]. " - "But received its shape " - "is [%s]", - tensor->dims())); + PADDLE_ENFORCE_EQ( + tensor->dims(), + common::make_ddim({1}), + phi::errors::InvalidArgument("The shape of Tensor in list must be [1]. 
" + "But received its shape " + "is [%s]", + tensor->dims())); if (framework::TransToProtoVarType(tensor->dtype()) == framework::proto::VarType::INT32) { @@ -57,7 +57,7 @@ inline std::vector GetDataFromTensorList( vec_new_data.push_back(static_cast(*tensor->data())); } } else { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "The dtype of Tensor in list must be int32 or int64, but received: " "%s", tensor->dtype())); diff --git a/paddle/fluid/operators/var_conv_2d_op.cc b/paddle/fluid/operators/var_conv_2d_op.cc index 86e3fc3420ed6..e8d69083e532e 100644 --- a/paddle/fluid/operators/var_conv_2d_op.cc +++ b/paddle/fluid/operators/var_conv_2d_op.cc @@ -66,33 +66,33 @@ void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ( ctx->HasInput("X"), true, - platform::errors::NotFound("X(Input) of VarConv2dOP is not found.")); + phi::errors::NotFound("X(Input) of VarConv2dOP is not found.")); PADDLE_ENFORCE_EQ( ctx->HasInput("W"), true, - platform::errors::NotFound("W(Input) of VarConv2dOP is not found.")); + phi::errors::NotFound("W(Input) of VarConv2dOP is not found.")); PADDLE_ENFORCE_EQ( ctx->HasInput("ROW"), true, - platform::errors::NotFound("Input(ROW) of VarConv2dOP is not found.")); + phi::errors::NotFound("Input(ROW) of VarConv2dOP is not found.")); PADDLE_ENFORCE_EQ( ctx->HasInput("COLUMN"), true, - platform::errors::NotFound("Input(COLUMN) of VarConv2dOP is not found.")); + phi::errors::NotFound("Input(COLUMN) of VarConv2dOP is not found.")); PADDLE_ENFORCE_EQ( ctx->HasOutput("Out"), true, - platform::errors::NotFound("Out(Output) of VarConv2dOP is not found.")); + phi::errors::NotFound("Out(Output) of VarConv2dOP is not found.")); PADDLE_ENFORCE_EQ( ctx->HasOutput("Col"), true, - platform::errors::NotFound("Col(Output) of VarConv2dOP is not found.")); + phi::errors::NotFound("Col(Output) of VarConv2dOP is not found.")); auto x_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ( x_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The rank of X(Input) can't be less than 2, but received rank is %u.", x_dims.size())); @@ -101,7 +101,7 @@ void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ( w_dims.size(), 2, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input W should be a 2-D tensor, but its actual dimension is %u.", w_dims.size())); int output_channel = ctx->Attrs().Get("OutputChannel"); @@ -111,7 +111,7 @@ void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ( w_dims[0], output_channel, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input W's dimension[0] should be equal to OutputChannel, the " "dimension[0] is %d, OutputChannel is %d.", w_dims[0], @@ -119,7 +119,7 @@ void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ( w_dims[1], input_channel * kernel_h * kernel_w, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input W's dimension[1] should be equal to InputChannel * StrideH * " "StrideW, the dimension[1] is %d, expected value is %d.", w_dims[1], @@ -131,17 +131,17 @@ void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const { const auto& x_lod = x_var->Get().lod(); PADDLE_ENFORCE_EQ(!x_lod.empty(), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(X) phi::DenseTensor of VarConv2dOP " "does not contain LoD information.")); - 
PADDLE_ENFORCE_GE(x_lod.size(), - 1, - platform::errors::InvalidArgument( - "The Input(X)'s lod info is corrupted.")); + PADDLE_ENFORCE_GE( + x_lod.size(), + 1, + phi::errors::InvalidArgument("The Input(X)'s lod info is corrupted.")); PADDLE_ENFORCE_EQ(x_dims[0], static_cast(x_lod[0].back()), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(X)'s lod info mismatches the actual " "tensor shape, input lod is %s, tensor shape is %s.", x_lod, @@ -153,7 +153,7 @@ void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ( !row_lod.empty(), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(ROW) phi::DenseTensor of VarConv2dOP does not " "contain LoD information.")); @@ -163,7 +163,7 @@ void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ( !col_lod.empty(), true, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(COLUMN) phi::DenseTensor of VarConv2dOP does not " "contain LoD information.")); } else { @@ -370,17 +370,17 @@ class VarConv2dGradMaker : public framework::SingleGradOpMaker { }; void VarConv2dOpGrad::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), - true, - platform::errors::NotFound( - "Input(X) of SequencePadGradOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("W"), - true, - platform::errors::NotFound( - "Input(W) of SequencePadGradOp is not found.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("X"), + true, + phi::errors::NotFound("Input(X) of SequencePadGradOp is not found.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("W"), + true, + phi::errors::NotFound("Input(W) of SequencePadGradOp is not found.")); PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::NotFound( + phi::errors::NotFound( "Input(Out@GRAD) of SequencePadGradOp is not found.")); if (ctx->HasOutput(framework::GradVarName("X"))) { diff --git a/paddle/fluid/operators/xpu_api_wrapper.h b/paddle/fluid/operators/xpu_api_wrapper.h deleted file mode 100644 index c23fb1ae02ab4..0000000000000 --- a/paddle/fluid/operators/xpu_api_wrapper.h +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#pragma once - -#ifdef PADDLE_WITH_XPU -#include "paddle/phi/kernels/xpu/xpu_api_wrapper.h" - -namespace paddle { -namespace operators { - -using float16 = typename XPUTypeTrait::Type; - -} // namespace operators -} // namespace paddle -#endif diff --git a/paddle/fluid/pir/CMakeLists.txt b/paddle/fluid/pir/CMakeLists.txt index 9e883ef21af9a..7647a7efdf660 100644 --- a/paddle/fluid/pir/CMakeLists.txt +++ b/paddle/fluid/pir/CMakeLists.txt @@ -2,3 +2,4 @@ add_subdirectory(dialect) add_subdirectory(transforms) add_subdirectory(drr) add_subdirectory(utils) +add_subdirectory(serialize_deserialize) diff --git a/paddle/fluid/pir/dialect/CMakeLists.txt b/paddle/fluid/pir/dialect/CMakeLists.txt index 59db81550bb8b..0b2fc8c47b75f 100644 --- a/paddle/fluid/pir/dialect/CMakeLists.txt +++ b/paddle/fluid/pir/dialect/CMakeLists.txt @@ -110,7 +110,7 @@ set(generated_files_pd_op "${pir_bwd_op_source_file}" "${pir_update_op_source_file}") -if(WITH_MKLDNN) +if(WITH_ONEDNN) set(pir_op_onednn_yaml ${parsed_op_dir}/onednn.parsed.yaml) set(pd_onednn_op_yaml_file @@ -250,7 +250,7 @@ set(op_dialect_srcs ${api_source_file} ${PADDLE_SOURCE_DIR}/paddle/fluid/pir/transforms/shape_optimization_pass.cc) -if(WITH_MKLDNN) +if(WITH_ONEDNN) set(op_dialect_srcs ${op_dialect_srcs} ${onednn_op_source_file} ${op_onednn_info_file} ${CMAKE_CURRENT_SOURCE_DIR}/operator/ir/manual_onednn_op.cc) @@ -263,7 +263,14 @@ file(GLOB_RECURSE dist_dialect_srcs # if(WITH_DISTRIBUTE) FIXME in next PR set(op_dialect_srcs ${op_dialect_srcs} ${dist_dialect_srcs}) # endif() -set(op_dialect_deps phi common pir type_info string_helper) +set(op_dialect_deps + phi + common + pir + type_info + string_helper + global_utils + amp) if(WITH_ROCM) set(op_dialect_deps ${op_dialect_deps} global_utils) endif() @@ -283,13 +290,13 @@ set(op_dialect_vjp_srcs ${op_vjp_source_file} ${PADDLE_SOURCE_DIR}/paddle/fluid/primitive/base/decomp_trans.cc) -if(WITH_MKLDNN) +if(WITH_ONEDNN) set(op_dialect_vjp_srcs ${op_dialect_vjp_srcs} ${CMAKE_CURRENT_SOURCE_DIR}/operator/ir/op_onednn_dialect.cc) endif() -set(op_dialect_vjp_deps primitive_vjp_experimental op_dialect) +set(op_dialect_vjp_deps primitive_vjp_experimental op_dialect prim_utils) cc_library( op_dialect_vjp diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_api.cc b/paddle/fluid/pir/dialect/distributed/ir/dist_api.cc index 3382fa18b9090..6ba2b16d00df2 100644 --- a/paddle/fluid/pir/dialect/distributed/ir/dist_api.cc +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_api.cc @@ -45,23 +45,20 @@ pir::Value shard_tensor(const pir::Value& x, return shard_tensor_op.out(); } -pir::Value reshard(const pir::Value& x, - const phi::distributed::ProcessMesh& process_mesh, - const std::vector& dims_mapping) { +pir::Value reshard( + const pir::Value& x, + const phi::distributed::ProcessMesh& process_mesh, + const std::vector& dims_mapping, + const flat_hash_map& partial_status) { pir::IrContext* ctx = pir::IrContext::Instance(); - // TODO(ywt01) get partial_status by func parameter - paddle::flat_hash_map partial_status; TensorDistAttribute tensor_dist_attr = TensorDistAttribute::get(ctx, process_mesh, dims_mapping, partial_status); - - auto reshard_op = ApiBuilder::Instance().GetBuilder()->Build( - x, tensor_dist_attr); - return reshard_op.result(0); + return reshard(x, tensor_dist_attr); } pir::Value reshard(const pir::Value& x, const TensorDistAttribute& tensor_dist_attr) { - auto reshard_op = ApiBuilder::Instance().GetBuilder()->Build( + auto reshard_op = ApiBuilder::Instance().GetBuilder()->Build( x, 
tensor_dist_attr); return reshard_op.result(0); } diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_api.h b/paddle/fluid/pir/dialect/distributed/ir/dist_api.h index 18aa1bb32ca64..5706afa63c165 100644 --- a/paddle/fluid/pir/dialect/distributed/ir/dist_api.h +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_api.h @@ -29,9 +29,11 @@ pir::Value shard_tensor(const pir::Value& x, const phi::distributed::ProcessMesh& process_mesh, const std::vector& dims_mapping); -pir::Value reshard(const pir::Value& x, - const phi::distributed::ProcessMesh& process_mesh, - const std::vector& dims_mapping); +pir::Value reshard( + const pir::Value& x, + const phi::distributed::ProcessMesh& process_mesh, + const std::vector& dims_mapping, + const flat_hash_map& partial_status = {}); pir::Value reshard(const pir::Value& x, const TensorDistAttribute& tensor_dist_attr); diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h b/paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h index 2b2be781c9ca8..9725206f5eaf4 100644 --- a/paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h @@ -75,6 +75,11 @@ class TensorDistAttribute : public pir::AttrBase& partial_status() const; + // construct a new attribute with new mesh attribute. + TensorDistAttribute CopyWithNewMesh(ProcessMeshAttribute mesh) const { + return get(ir_context(), mesh, dims_mapping(), partial_status()); + } + static TensorDistAttribute get( pir::IrContext* ctx, ProcessMeshAttribute mesh, diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_dialect.cc b/paddle/fluid/pir/dialect/distributed/ir/dist_dialect.cc index 0ea42bf6e093d..5834ba6262f3f 100644 --- a/paddle/fluid/pir/dialect/distributed/ir/dist_dialect.cc +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_dialect.cc @@ -35,7 +35,7 @@ void DistDialect::initialize() { TensorDistAttribute, OperationDistAttribute>(); RegisterTypes(); - RegisterOps(); + RegisterOps(); } void DistDialect::PrintType(pir::Type type, std::ostream &os) const { diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_interface.h b/paddle/fluid/pir/dialect/distributed/ir/dist_interface.h index 6fca7d4442b7c..c3fe93521da14 100644 --- a/paddle/fluid/pir/dialect/distributed/ir/dist_interface.h +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_interface.h @@ -28,13 +28,17 @@ class IR_API DistTypeInterface /// Defined these methods with the interface. 
explicit Concept(pir::Type (*local_type)(pir::Type), ProcessMeshAttribute (*process_mesh_attr)(pir::Type), - TensorDistAttribute (*tensor_dist_attr)(pir::Type)) + TensorDistAttribute (*tensor_dist_attr)(pir::Type), + pir::Type (*copy_with_new_mesh)(pir::Type, + ProcessMeshAttribute mesh)) : local_type(local_type), process_mesh_attr(process_mesh_attr), - tensor_dist_attr(tensor_dist_attr) {} + tensor_dist_attr(tensor_dist_attr), + copy_with_new_mesh(copy_with_new_mesh) {} pir::Type (*local_type)(pir::Type); ProcessMeshAttribute (*process_mesh_attr)(pir::Type); TensorDistAttribute (*tensor_dist_attr)(pir::Type); + pir::Type (*copy_with_new_mesh)(pir::Type, ProcessMeshAttribute mesh); }; template @@ -50,7 +54,15 @@ class IR_API DistTypeInterface return pir::cast(type).tensor_dist_attr(); } - Model() : Concept(local_type, process_mesh_attr, tensor_dist_attr) {} + static Type CopyWithNewMesh(Type type, ProcessMeshAttribute mesh) { + return pir::cast(type).CopyWithNewMesh(mesh); + } + + Model() + : Concept(local_type, + process_mesh_attr, + tensor_dist_attr, + CopyWithNewMesh) {} }; DistTypeInterface(pir::Type type, Concept *impl) @@ -66,6 +78,10 @@ class IR_API DistTypeInterface return impl_->tensor_dist_attr(*this); } + DistTypeInterface CopyWithNewMesh(ProcessMeshAttribute mesh) { + return DistTypeInterface(impl_->copy_with_new_mesh(*this, mesh), impl_); + } + private: Concept *impl_; }; diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_op.cc b/paddle/fluid/pir/dialect/distributed/ir/dist_op.cc index cc06461e66d55..d419ea7d4d165 100644 --- a/paddle/fluid/pir/dialect/distributed/ir/dist_op.cc +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_op.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/pir/dialect/distributed/ir/dist_op.h" #include "paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h" #include "paddle/fluid/pir/dialect/distributed/ir/dist_type.h" +#include "paddle/fluid/pir/dialect/operator/ir/api_builder.h" #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" #include "paddle/fluid/pir/dialect/operator/ir/op_type.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" @@ -28,7 +29,7 @@ namespace paddle { namespace dialect { const char* ShardTensorOp::attributes_name[1] = {"op_dist_attr"}; -const char* ReShardOp::attributes_name[1] = {"op_dist_attr"}; +const char* ReshardOp::attributes_name[1] = {"op_dist_attr"}; void ShardTensorOp::VerifySig() { VLOG(4) @@ -159,8 +160,54 @@ void ShardTensorOp::Build(pir::Builder& builder, ::pir::PassStopGradientsDefaultly(argument); } -void ReShardOp::VerifySig() { - VLOG(4) << "Start Verifying inputs, outputs and attributes for: ReShardOp."; +OpInfoTuple ReshardOp::GetOpInfo() { + return OpInfoTuple( + {OpInputInfo()}, {}, {OpOutputInfo()}, OpRunTimeInfo(), "reshard"); +} + +std::vector> ReshardOp::Vjp( + pir::Operation* op, + const std::vector>& inputs_, + const std::vector>& outputs, + const std::vector>& out_grads, + const std::vector>& stop_gradients) { + VLOG(6) << "Start call vjp for reshard op."; + PADDLE_ENFORCE_EQ( + inputs_.size(), + 1, + common::errors::InvalidArgument("reshard op's inputs' size should be 1")); + PADDLE_ENFORCE_EQ(inputs_[0].size(), + 1, + common::errors::InvalidArgument( + "reshard op's inputs[0]'s size should be 1")); + auto dist_type = inputs_[0][0].type().dyn_cast(); + + PADDLE_ENFORCE_NOT_NULL( + dist_type, + common::errors::InvalidArgument( + "Currently, reshard op's inputs type must be dist type.")); + + PADDLE_ENFORCE_EQ(out_grads.size(), + 1, + common::errors::InvalidArgument( + "reshard op's 
outputs grad size should be 1")); + + PADDLE_ENFORCE_EQ(out_grads[0].size(), + 1, + common::errors::InvalidArgument( + "reshard op's outputs grad[0] size should be 1")); + + auto& builder = *ApiBuilder::Instance().GetBuilder(); + + auto grad_op = + builder.Build(out_grads[0][0], dist_type.tensor_dist_attr()); + + VLOG(6) << "End call vjp for reshard op."; + + return {std::vector{grad_op->result(0)}}; +} +void ReshardOp::VerifySig() { + VLOG(4) << "Start Verifying inputs, outputs and attributes for: ReshardOp."; VLOG(4) << "Verifying inputs:"; { auto input_size = num_operands(); @@ -224,11 +271,11 @@ void ReShardOp::VerifySig() { VLOG(4) << "End Verifying for: ShardTensorOp."; } -void ReShardOp::Build(pir::Builder& builder, +void ReshardOp::Build(pir::Builder& builder, pir::OperationArgument& argument, pir::Value input, TensorDistAttribute tensor_dist_attr) { - VLOG(4) << "Start build ReShardOp"; + VLOG(4) << "Start build ReshardOp"; paddle::dialect::DistDenseTensorType input_tensor_type; if (input.type().isa()) { @@ -270,10 +317,11 @@ void ReShardOp::Build(pir::Builder& builder, tensor_dist_attr, local_shape); argument.AddOutput(out_dist_tensor_type); + ::pir::PassStopGradientsDefaultly(argument); } } // namespace dialect } // namespace paddle IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::ShardTensorOp) -IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::ReShardOp) +IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::ReshardOp) diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_op.h b/paddle/fluid/pir/dialect/distributed/ir/dist_op.h index 7ae81a0040702..638fb430eaf4e 100644 --- a/paddle/fluid/pir/dialect/distributed/ir/dist_op.h +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_op.h @@ -15,6 +15,8 @@ #pragma once #include +#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" +#include "paddle/fluid/pir/dialect/operator/interface/vjp.h" #include "paddle/pir/include/core/builder.h" #include "paddle/pir/include/core/builtin_type.h" #include "paddle/pir/include/core/op_base.h" @@ -39,7 +41,7 @@ class ShardTensorOp : public pir::Op { void VerifySig(); }; -class ReShardOp : public pir::Op { +class ReshardOp : public pir::Op { public: using Op::Op; static const char* name() { return "dist_op.reshard"; } @@ -49,10 +51,19 @@ class ReShardOp : public pir::Op { pir::OperationArgument& argument, // NOLINT pir::Value input, TensorDistAttribute tensor_dist_attr); + + static OpInfoTuple GetOpInfo(); + static std::vector> Vjp( + pir::Operation* op, + const std::vector>& inputs_, + const std::vector>& outputs, + const std::vector>& out_grads, + const std::vector>& stop_gradients); + void VerifySig(); }; } // namespace dialect } // namespace paddle IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::ShardTensorOp) -IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::ReShardOp) +IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::ReshardOp) diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_tools.cc b/paddle/fluid/pir/dialect/distributed/ir/dist_tools.cc index 9741a76714816..d4e6daf22149b 100644 --- a/paddle/fluid/pir/dialect/distributed/ir/dist_tools.cc +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_tools.cc @@ -86,5 +86,29 @@ TensorDistAttribute CvtToPirDistAttr( attr.partial_status()); } +void CopyLeafOpToMesh(pir::Value value, ProcessMeshAttribute mesh_attr) { + if (auto dist_type = value.type().dyn_cast()) { + if (dist_type.process_mesh_attr() == mesh_attr) { + return; + } + if (auto op = value.defining_op()) { + if (op->num_operands() != 0u || op->num_results() != 1u) { + return; + } + pir::IrMapping 
ir_mapping; + auto new_op = op->Clone(ir_mapping); + op->GetParent()->insert(*op, new_op); + value.ReplaceAllUsesWith(new_op->result(0)); + dist_type = dist_type.CopyWithNewMesh(mesh_attr); + value.set_type(dist_type); + op->set_attribute( + kAttrOpDistAttr, + OperationDistAttribute::get(dist_type.ir_context(), + mesh_attr, + {}, + {dist_type.tensor_dist_attr()})); + } + } +} } // namespace dialect } // namespace paddle diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_tools.h b/paddle/fluid/pir/dialect/distributed/ir/dist_tools.h index 24d8d2d2143b0..bbd62617c8cf0 100644 --- a/paddle/fluid/pir/dialect/distributed/ir/dist_tools.h +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_tools.h @@ -31,5 +31,17 @@ phi::distributed::DistMetaTensor CvtToDistMetaTensor(DistDenseTensorType type); TensorDistAttribute CvtToPirDistAttr( const phi::distributed::ArgDistAttr& dist_attr); +/// +/// When the following conditions are met: +/// 1. The value's type is dist type. +/// 2. The value type's mesh is not equal to the mesh_attr argument. +/// 3. The operation that defines the value has no inputs and one output. +/// The function first clones the defining operation and replaces all uses of +/// the original value with the cloned output. Second, the mesh of the +/// original operation and value is updated with the 'mesh_attr' argument. +/// Otherwise, the function does nothing. +/// +void CopyLeafOpToMesh(pir::Value value, ProcessMeshAttribute mesh_attr); + } // namespace dialect } // namespace paddle diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_type.cc b/paddle/fluid/pir/dialect/distributed/ir/dist_type.cc index 5753608c85256..d1b70c24a1c56 100644 --- a/paddle/fluid/pir/dialect/distributed/ir/dist_type.cc +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_type.cc @@ -60,7 +60,7 @@ common::DDim InferLocalDDim(const common::DDim& global_ddim, return local_ddim; } -auto DistDenseTensorType::local_type() const -> Type { +pir::DenseTensorType DistDenseTensorType::local_type() const { return pir::DenseTensorType::get(pir::IrContext::Instance(), dtype(), local_ddim(), diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_type.h b/paddle/fluid/pir/dialect/distributed/ir/dist_type.h index 2344a97399e34..c83904a02aef9 100644 --- a/paddle/fluid/pir/dialect/distributed/ir/dist_type.h +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_type.h @@ -45,8 +45,8 @@ class DistDenseTensorType const LoD& lod() const { return dense_tensor_type().lod(); } size_t offset() const { return dense_tensor_type().offset(); } - Type prim_type() { return dense_tensor_type(); } - Type local_type() const; + pir::DenseTensorType prim_type() { return dense_tensor_type(); } + pir::DenseTensorType local_type() const; ProcessMeshAttribute process_mesh_attr() const { return tensor_dist_attr().process_mesh_attr(); @@ -61,6 +61,13 @@ class DistDenseTensorType return tensor_dist_attr().partial_status(); } + DistDenseTensorType CopyWithNewMesh(ProcessMeshAttribute mesh) { + return get(ir_context(), + dense_tensor_type(), + tensor_dist_attr().CopyWithNewMesh(mesh), + local_ddim()); + } + static DistDenseTensorType get(pir::IrContext* ctx, pir::DenseTensorType dense_tensor_type, TensorDistAttribute tensor_dist_attr, diff --git a/paddle/fluid/pir/dialect/distributed/transforms/dist_to_dense_pass.cc b/paddle/fluid/pir/dialect/distributed/transforms/dist_to_dense_pass.cc new file mode 100644 index 0000000000000..086feecffa396 --- /dev/null +++ b/paddle/fluid/pir/dialect/distributed/transforms/dist_to_dense_pass.cc @@ -0,0
+1,136 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/pir/dialect/distributed/transforms/dist_to_dense_pass.h" + +#include +#include +#include + +#include "paddle/common/flags.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h" +#include "paddle/fluid/pir/dialect/distributed/ir/dist_dialect.h" +#include "paddle/fluid/pir/dialect/distributed/ir/dist_interface.h" +#include "paddle/fluid/pir/dialect/distributed/ir/dist_type.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/fluid/pir/dialect/operator/utils/utils.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/pir/include/core/attribute.h" + +using paddle::dialect::DistDenseTensorType; + +COMMON_DECLARE_bool(print_ir); + +namespace paddle { +namespace dialect { + +inline pir::Type CastToLocalType(pir::Type dist_type) { + return dist_type.dyn_cast().local_type(); +} + +inline bool IsDistType(pir::Type type) { return type.isa(); } + +void ProcessDistBlock(pir::Block* block) { + for (auto iter = block->begin(); iter != block->end(); ++iter) { + pir::Operation* op_item = &(*iter); + VLOG(6) << "dist_to_dense main loop over op [" << op_item->name() << "]."; + + for (size_t i = 0; i < op_item->num_results(); ++i) { + auto result = op_item->result(i); + auto origin_type = result.type(); + if (IsDistType(origin_type)) { + auto local_type = CastToLocalType(origin_type); + result.set_type(local_type); + } else if (origin_type) { // skip if <> + // TODO(2024-Q2) not all values are dist type + PADDLE_THROW(platform::errors::PreconditionNotMet( + "The op [%s]'s [%d]th result is not Dist type.", + op_item->name(), + i)); + } + } + // TODO(2024-Q2) not all ops are dist type + // PADDLE_ENFORCE_EQ( + // (op_item->HasAttribute(kAttrOpDistAttr) && + // op_item->attribute(kAttrOpDistAttr) + // .isa()), + // true, + // common::errors::PreconditionNotMet("The op [%s] has no + // op_dist_attr.", + // op_item->name())); if (op_item->HasAttribute(kAttrOpDistAttr)) { + op_item->erase_attribute(kAttrOpDistAttr); + } + + // TODO(2024-Q2) Handle other special dist ops in the future. + } +} + +/* Verification: + 1. no operator has an OperatorDistAttr. + 2. all Values (Results) are DenseTensorType. + 3. no shard_tensor / reshard in block.
+*/ +void VerifyDenseBlock(pir::Block* block) { + for (auto iter = block->begin(); iter != block->end(); ++iter) { + pir::Operation* op_item = &(*iter); + + for (size_t i = 0; i < op_item->num_results(); ++i) { + auto result = op_item->result(i); + + PADDLE_ENFORCE_EQ( + IsDistType(result.type()), + false, + phi::errors::PreconditionNotMet( + "Block op [%s] still contain dist type.", op_item->name())); + } + + PADDLE_ENFORCE_EQ( + op_item->HasAttribute(kAttrOpDistAttr), + false, + common::errors::PreconditionNotMet( + "The op [%s] still has op_dist_attr.", op_item->name())); + } +} + +std::shared_ptr DistToDensePass(pir::Program* prog) { + if (FLAGS_print_ir) { + VLOG(0) << "IR before DistToDense Pass = " << *prog; + } + + pir::IrMapping mapper; + auto new_prog = prog->Clone(mapper); + + pir::IrContext* ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + ctx->GetOrRegisterDialect(); + + ProcessDistBlock(new_prog->block()); + VLOG(6) << "IR before VerifyDenseBlock Pass = " << *new_prog; + VerifyDenseBlock(new_prog->block()); + + if (FLAGS_print_ir) { + VLOG(0) << "IR after DistToDense Pass = " << *new_prog; + } + + return new_prog; +} + +} // namespace dialect +} // namespace paddle diff --git a/paddle/cinn/frontend/group_cluster/cluster_policy/general_topo_policy.h b/paddle/fluid/pir/dialect/distributed/transforms/dist_to_dense_pass.h similarity index 65% rename from paddle/cinn/frontend/group_cluster/cluster_policy/general_topo_policy.h rename to paddle/fluid/pir/dialect/distributed/transforms/dist_to_dense_pass.h index c7cfc23feb89e..970aaaa564271 100644 --- a/paddle/cinn/frontend/group_cluster/cluster_policy/general_topo_policy.h +++ b/paddle/fluid/pir/dialect/distributed/transforms/dist_to_dense_pass.h @@ -11,15 +11,18 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
- #pragma once -#include "paddle/cinn/frontend/group_cluster/cluster_policy/policy_manager.h" -namespace cinn::frontend::group_cluster::policy { +#include "paddle/pir/include/core/program.h" + +namespace paddle { +namespace dialect { + +TEST_API std::shared_ptr DistToDensePass(pir::Program* prog); + +void ProcessDistBlock(pir::Block* block); -class GeneralTopoPolicy final : virtual public Policy { - public: - bool CanFuse(const PatternNodePtr upstream, const PatternNodePtr downstream); -}; +void VerifyDenseBlock(pir::Block* block); -} // namespace cinn::frontend::group_cluster::policy +} // namespace dialect +} // namespace paddle diff --git a/paddle/fluid/pir/dialect/distributed/transforms/mix_to_dist_pass.cc b/paddle/fluid/pir/dialect/distributed/transforms/mix_to_dist_pass.cc index 60d42984c57b6..fe0f5254d2b5f 100644 --- a/paddle/fluid/pir/dialect/distributed/transforms/mix_to_dist_pass.cc +++ b/paddle/fluid/pir/dialect/distributed/transforms/mix_to_dist_pass.cc @@ -44,7 +44,7 @@ inline bool IsShardTensorOp(pir::Operation* op) { return op_name.find("shard_tensor") != op_name.npos; } -void ProcessBlock(pir::Block* block) { +void ProcessMixBlock(pir::Block* block) { std::vector deleted_ops; for (auto iter = block->begin(); iter != block->end(); ++iter) { @@ -80,6 +80,15 @@ void ProcessBlock(pir::Block* block) { shard_operand_define_op->set_attribute( kAttrOpDistAttr, op_item->attribute(kAttrOpDistAttr)); + // set stop gradient and persistable + if (op_item->HasAttribute(kAttrStopGradients)) { + shard_operand_define_op->set_attribute( + kAttrStopGradients, op_item->attribute(kAttrStopGradients)); + } + if (op_item->HasAttribute(kAttrIsPersistable)) { + shard_operand_define_op->set_attribute( + kAttrIsPersistable, op_item->attribute(kAttrIsPersistable)); + } deleted_ops.push_back(op_item); } @@ -98,7 +107,7 @@ void ProcessBlock(pir::Block* block) { 2. all Values (Results) are DistDenseTensorType. 3. no shard_tensor in block. 
*/ -void VerifyBlock(pir::Block* block) { +void VerifyDistBlock(pir::Block* block) { for (auto iter = block->begin(); iter != block->end(); ++iter) { pir::Operation* op_item = &(*iter); PADDLE_ENFORCE_EQ(paddle::dialect::IsShardTensorOp(op_item), @@ -135,8 +144,8 @@ std::shared_ptr MixToDistPass(pir::Program* prog) { ctx->GetOrRegisterDialect(); ctx->GetOrRegisterDialect(); - ProcessBlock(new_prog->block()); - VerifyBlock(new_prog->block()); + ProcessMixBlock(new_prog->block()); + VerifyDistBlock(new_prog->block()); if (FLAGS_print_ir) { std::cout << "IR after MixToDist Pass = " << *new_prog << std::endl; diff --git a/paddle/fluid/pir/dialect/distributed/transforms/mix_to_dist_pass.h b/paddle/fluid/pir/dialect/distributed/transforms/mix_to_dist_pass.h index 978f64f12d2b1..7212fd79ab9fe 100644 --- a/paddle/fluid/pir/dialect/distributed/transforms/mix_to_dist_pass.h +++ b/paddle/fluid/pir/dialect/distributed/transforms/mix_to_dist_pass.h @@ -22,9 +22,9 @@ namespace dialect { TEST_API std::shared_ptr MixToDistPass(pir::Program* prog); -void ProcessBlock(pir::Block* block); +void ProcessMixBlock(pir::Block* block); -void VerifyBlock(pir::Block* block); +void VerifyDistBlock(pir::Block* block); } // namespace dialect } // namespace paddle diff --git a/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py b/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py index 4d37aaf829861..6ca4b6d18680b 100644 --- a/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py +++ b/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py @@ -38,19 +38,19 @@ "leaky_relu", "log_softmax", "mean", + "p_norm", "pow", + "reciprocal", "relu", "relu6", "sigmoid", "silu", "swiglu", "softmax", - "sqrt", "square", "squeeze", "stack", "unsqueeze", - "tile", ] # come into effect in generated file op_decomp.cc @@ -72,19 +72,19 @@ "leaky_relu", "log_softmax", "mean", + "p_norm", "pow", + "reciprocal", "relu", "relu6", "sigmoid", "silu", "swiglu", "softmax", - "sqrt", "square", "squeeze", "stack", "unsqueeze", - "tile", ] diff --git a/paddle/fluid/pir/dialect/op_generator/op_build_gen.py b/paddle/fluid/pir/dialect/op_generator/op_build_gen.py index 99daa1a8c1585..ee45bdf338270 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_build_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_build_gen.py @@ -387,9 +387,7 @@ def GenBuildAttributes( op_attribute_type=op_non_mutable_attribute_type_list[idx], attr=op_non_mutable_attribute_name_list[idx], ) - attr_str += """ argument_attributes.insert({{"{attr_name}", attr_{attr_name}}});\n""".format( - attr_name=op_non_mutable_attribute_name_list[idx] - ) + attr_str += f""" argument_attributes.insert({{"{op_non_mutable_attribute_name_list[idx]}", attr_{op_non_mutable_attribute_name_list[idx]}}});\n""" return attr_str @@ -558,15 +556,11 @@ def GenBuildOutputs( # is a vector if 'pir::VectorType' in op_input_type_list[idx]: if op_input_optional_list[idx] == 'false': - build_output_str += " pir::VectorType {name} = {name}_.type().dyn_cast(); (void){name};\n".format( - name=op_input_name_list[idx] - ) + build_output_str += f" pir::VectorType {op_input_name_list[idx]} = {op_input_name_list[idx]}_.type().dyn_cast(); (void){op_input_name_list[idx]};\n" # is a Tensor else: if op_input_optional_list[idx] == 'false': - build_output_str += " {type} {name} = {name}_.type().dyn_cast<{type}>(); (void){name};\n".format( - type=op_input_type_list[idx], name=op_input_name_list[idx] - ) + build_output_str += f" {op_input_type_list[idx]} 
{op_input_name_list[idx]} = {op_input_name_list[idx]}_.type().dyn_cast<{op_input_type_list[idx]}>(); (void){op_input_name_list[idx]};\n" # Prepare mutable attributes if mutable_attr_is_input: @@ -826,13 +820,11 @@ def gen_build_func_str( op_non_mutable_attribute_type_list, ) - build_outputs_str = """ - std::vector argument_outputs = {op_name}::InferMeta(argument_inputs, &argument_attributes); + build_outputs_str = f""" + std::vector argument_outputs = {op_info.class_name}::InferMeta(argument_inputs, &argument_attributes); argument.AddAttributes(argument_attributes); argument.AddOutputs(argument_outputs.begin(), argument_outputs.end()); - ::pir::PassStopGradientsDefaultly(argument);""".format( - op_name=op_info.class_name - ) + ::pir::PassStopGradientsDefaultly(argument);""" GET_ATTRIBUTES_FROM_MAP_TEMPLATE = """ PADDLE_ENFORCE_NE( diff --git a/paddle/fluid/pir/dialect/op_generator/op_gen.py b/paddle/fluid/pir/dialect/op_generator/op_gen.py index c264bd246ce60..ebe06caab438a 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_gen.py @@ -54,6 +54,7 @@ 'BatchNormOp', 'FetchOp', 'FullIntArrayOp', + 'FusedConv2dAddActOp', 'MatmulOp', 'SoftmaxOp', 'ReshapeOp', @@ -86,6 +87,7 @@ #include "paddle/fluid/pir/dialect/operator/interface/decomp.h" #include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_symbolic_shape.h" #include "paddle/fluid/pir/dialect/operator/interface/infermeta.h" +#include "paddle/fluid/pir/dialect/operator/interface/layout_transformation.h" #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" #include "paddle/fluid/pir/dialect/operator/interface/parse_kernel_key.h" #include "paddle/fluid/pir/dialect/operator/interface/vjp.h" @@ -180,6 +182,7 @@ class {TEST_API} {op_name} : public pir::Op<{op_name}{interfaces}{traits}> {{ CC_FILE_TEMPLATE = """// This file is generated by "paddle/fluid/pir/dialect/op_generator/op_gen.py" #include "{h_file}" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h" +#include "paddle/fluid/pir/dialect/operator/interface/layout_transformation.h" #include "paddle/fluid/pir/dialect/operator/ir/ir_meta_tensor.h" #include "paddle/fluid/pir/dialect/operator/ir/ir_selected_rows.h" #include "paddle/fluid/pir/dialect/operator/ir/ir_tensor.h" @@ -1507,9 +1510,7 @@ def AutoCodeGen( muta_attr_is_input=True, ) - build_mutable_attr_is_input = "static void Build({build_args});".format( - build_args=build_args_with_muta_attr_is_input_for_declare - ) + build_mutable_attr_is_input = f"static void Build({build_args_with_muta_attr_is_input_for_declare});" if (op_invoke_map is not None) and ( op_invoke_map['func'] in op_info_items ): diff --git a/paddle/fluid/pir/dialect/op_generator/op_infermeta_func_gen.py b/paddle/fluid/pir/dialect/op_generator/op_infermeta_func_gen.py index 5e0b696507fa5..61e43c53e6d1b 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_infermeta_func_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_infermeta_func_gen.py @@ -111,9 +111,7 @@ def get_infermeta_inputs_str( # is a vector if 'pir::VectorType' in op_input_type_list[idx]: if op_input_optional_list[idx] == 'false': - infermeta_inputs_str += " pir::VectorType {name} = {name}_.type().dyn_cast(); (void){name};\n".format( - name=op_input_name_list[idx] - ) + infermeta_inputs_str += f" pir::VectorType {op_input_name_list[idx]} = {op_input_name_list[idx]}_.type().dyn_cast(); (void){op_input_name_list[idx]};\n" # is a Tensor else: if op_input_optional_list[idx] == 'false': @@ -611,10 +609,17 
@@ def GenDistBranch(args, op_info): // Auto Parallel condition ProcessMeshAttribute op_mesh; if(HasDistInput(input_values, &op_mesh)) {{ + {} CvtAllInputsToDist(input_values, op_mesh); auto ctx = pir::IrContext::Instance(); std::vector operand_dist_attrs, result_dist_attrs;""" - dist_branch_str = TEMPLATE.format() + + extra_call = "" + for name in op_info.spmd_params: + if name == "learning_rate": + extra_call = "CopyLeafOpToMesh(learning_rate_, op_mesh);" + break + dist_branch_str = TEMPLATE.format(extra_call) infer_spmd_args_list = [] # Prepare inputs_meta_tensor & attributes for infer spmd for name in op_info.spmd_params: diff --git a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py index 5ad1c5b562740..8ba3d64ad39a3 100644 --- a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py @@ -125,6 +125,7 @@ 'add_n_', 'all_reduce', 'all_reduce_', + 'assign_pos', 'batch_fc', 'barrier', 'c_allgather', @@ -140,8 +141,12 @@ 'c_softmax_with_cross_entropy', 'c_split', 'decayed_adagrad', + 'distributed_fused_lamb', + 'distributed_fused_lamb_', 'distributed_push_sparse', 'distributed_lookup_table', + 'dgc_momentum', + 'dgc', 'dpsgd', 'embedding_grad_sparse', 'ftrl', diff --git a/paddle/fluid/pir/dialect/op_generator/vjp_interface_black_list.py b/paddle/fluid/pir/dialect/op_generator/vjp_interface_black_list.py index 4b2bbc3c54999..c0620d4dbdc43 100644 --- a/paddle/fluid/pir/dialect/op_generator/vjp_interface_black_list.py +++ b/paddle/fluid/pir/dialect/op_generator/vjp_interface_black_list.py @@ -22,7 +22,13 @@ # remove this file and support Vjp methods # code gen. +# Operators that only have a composite implementation should be added below. +# For example: +# * `silu_double_grad` only has a composite implementation, so `silu_grad` was added below. +# * `log_double_grad` has both composite and kernel implementations, so `log_grad` should not be added below.
vjp_interface_black_list = [ 'silu_grad', + 'exp_grad', + 'abs_double_grad', ] diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/binary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/binary_infer_sym.cc index 42b3567290cda..49e62dbf59503 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/binary_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/binary_infer_sym.cc @@ -38,7 +38,7 @@ inline void UpdatePaddingAndDilation( symbol::DimExpr one{1}; symbol::DimExpr two{2}; if (padding_algorithm == "SAME") { - symbol::DimExprBuilder builder{nullptr}; + symbol::DimExprBuilder builder; for (size_t i = 0; i < data_dims.size(); ++i) { symbol::DimExpr out_size = (data_dims[i] + strides[i] - 1) / strides[i]; symbol::DimExpr pad_sum = builder.Max( @@ -205,7 +205,9 @@ bool SparseWeightEmbeddingOpInferSymbolicShape( bool ExpandAsOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); + op->name() + + " 's InferSymbolicShape interface is NOT implemented " + "now because of the lack of necessary information.")); return true; } @@ -354,8 +356,16 @@ bool KronOpInferSymbolicShape(pir::Operation *op, bool MaskedSelectOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); + const std::vector<symbol::DimExpr> &out_dims = [&] { + std::vector<symbol::DimExpr> out_dims; + symbol::DimExpr out_shape = + shape_analysis->GetNextSymName(); // unknown until runtime + out_dims.push_back(out_shape); + return out_dims; + }(); + // TODO(fty1777): Add constraints between the shapes of x and mask + shape_analysis->SetShapeOrDataForValue( + op->result(0), symbol::TensorShapeOrDataDimExprs{out_dims}); return true; } @@ -416,9 +426,10 @@ bool MatmulOpInferSymbolicShape( } else if (ndims_x < ndims_y) { out_dims.assign(y_dims.begin(), y_dims.end() - 2); } else { - symbol::DimExprBuilder builder{nullptr}; + symbol::DimExprBuilder builder; for (size_t i = 0; i < ndims_x - 2; ++i) { out_dims.emplace_back(builder.Broadcast(x_dims[i], y_dims[i])); + shape_analysis->AddBroadcastableCstr(x_dims[i], y_dims[i]); } } @@ -440,21 +451,17 @@ bool MatmulOpInferSymbolicShape( if ((ndims_x == ndims_y) && ndims_x >= 2) { if (transpose_x_attr == false && transpose_y_attr == false) { - shape_analysis->DimExprBuilder().CstrEq(x_dims[ndims_x - 1], - y_dims[ndims_x - 2]); + shape_analysis->AddEqualCstr(x_dims[ndims_x - 1], y_dims[ndims_x - 2]); } else if (transpose_x_attr == false && transpose_y_attr == true) { - shape_analysis->DimExprBuilder().CstrEq(x_dims[ndims_x - 1], - y_dims[ndims_x - 1]); + shape_analysis->AddEqualCstr(x_dims[ndims_x - 1], y_dims[ndims_x - 1]); } else if (transpose_x_attr == true && transpose_y_attr == false) { - shape_analysis->DimExprBuilder().CstrEq(x_dims[ndims_x - 2], - y_dims[ndims_x - 2]); + shape_analysis->AddEqualCstr(x_dims[ndims_x - 2], y_dims[ndims_x - 2]); } else { - shape_analysis->DimExprBuilder().CstrEq(x_dims[ndims_x - 2], - y_dims[ndims_x - 1]); + shape_analysis->AddEqualCstr(x_dims[ndims_x - 2], y_dims[ndims_x - 1]); } for (size_t i = 0; i < ndims_x - 2; ++i) { - shape_analysis->DimExprBuilder().CstrEq(x_dims[i], y_dims[i]); + shape_analysis->AddEqualCstr(x_dims[i], y_dims[i]); } } return true; @@ -462,8 +469,12 @@ bool
MatmulOpInferSymbolicShape( bool SearchsortedOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); + // The shape of the output is the same as that of input `values` (op->operand_source(1)) + const symbol::ShapeOrDataDimExprs &operand_shape_or_data = + shape_analysis->GetShapeOrDataForValue(op->operand_source(1)); + // TODO(fty1777): Add constraints between the shapes of `sorted_sequence` and + // `values` + shape_analysis->SetShapeOrDataForValue(op->result(0), operand_shape_or_data); return true; } diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc index 35d4992539111..5302e1f76cdc2 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc @@ -85,9 +85,6 @@ bool ConcatOpInferSymbolicShape( return out_dims; }; - VLOG(3) << "constraints size:" - << shape_analysis->DimExprBuilder().constraints().size(); - symbol::ShapeOrDataDimExprs shape_data{ symbol::TensorShapeOrDataDimExprs(GetOutDimExprs())}; @@ -129,6 +126,18 @@ bool ReshapeOpInferSymbolicShape( std::vector shape = paddle::dialect::details::GetVectorAttr(op, "shape"); + const symbol::ShapeOrDataDimExprs &x_dim_expr = + shape_analysis->GetShapeOrDataForValue(op->operand_source(0)); + if (x_dim_expr.data().has_value()) { + if (shape.size() == 1 && shape.front() == 1) { + shape_analysis->SetShapeOrDataForValue( + op->result(0), + symbol::TensorShapeOrDataDimExprs(std::vector<symbol::DimExpr>{1}, + x_dim_expr.data().value())); + return true; + } + } + const auto &GetProduct = [&](const auto &dim_exprs, const auto &Filter) { symbol::DimExpr product{1}; for (const auto &dim_expr : dim_exprs) { diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/element_wise_binary.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/element_wise_binary.cc index 170143307dc06..e220d06f99020 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/element_wise_binary.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/element_wise_binary.cc @@ -15,37 +15,19 @@ #include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/element_wise_binary.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" -bool ShouldUseData(pir::Value val) { - if (!val.defining_op()) return false; - if (val.defining_op()->isa()) { - return true; - } - return false; -} - bool InferSymbolicShapeElementWiseBinary( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - const auto &x_shapeordata = + pir::Operation *op, + pir::ShapeConstraintIRAnalysis *shape_analysis, + const std::function<symbol::DimExpr(const symbol::DimExpr &, const symbol::DimExpr &)> + &DataComputeFunc = nullptr) { const auto &x_shape = shape_analysis->GetShapeOrDataForValue(op->operand_source(0)); - std::vector<symbol::DimExpr> shape_0; - // For ElementWiseBinary ops, if the input tensor is from full op, the value - // of fullop is useless, only the shape need doing broadcast - if (ShouldUseData(op->operand_source(0)) && - x_shapeordata.data().has_value()) { - shape_0 = x_shapeordata.data().value(); - } else { - shape_0 = x_shapeordata.shape(); - } + std::vector<symbol::DimExpr> shape_0 = x_shape.shape(); - const auto &y_shapeordata = + const auto &y_shape =
shape_analysis->GetShapeOrDataForValue(op->operand_source(1)); - std::vector shape_1; - if (ShouldUseData(op->operand_source(1)) && - y_shapeordata.data().has_value()) { - shape_1 = y_shapeordata.data().value(); - } else { - shape_1 = y_shapeordata.shape(); - } + std::vector shape_1 = y_shape.shape(); int diff = shape_0.size() - shape_1.size(); if (diff > 0) { @@ -60,7 +42,7 @@ bool InferSymbolicShapeElementWiseBinary( const std::vector shapes = [&] { std::vector shapes; - symbol::DimExprBuilder builder{nullptr}; + symbol::DimExprBuilder builder; for (size_t i = 0; i < shape_0.size(); i++) { if (shape_0[i] == shape_1[i]) { shapes.emplace_back(shape_0[i]); @@ -70,17 +52,45 @@ bool InferSymbolicShapeElementWiseBinary( shapes.emplace_back(shape_0[i]); } else { shapes.emplace_back(builder.Broadcast(shape_0[i], shape_1[i])); + shape_analysis->AddBroadcastableCstr(shape_0[i], shape_1[i]); } } return shapes; }(); - // TODO(lanxianghit): fill data when the operation is on shape computation - // std::vector data; - symbol::ShapeOrDataDimExprs shape_data{ - symbol::TensorShapeOrDataDimExprs(shapes)}; - shape_analysis->SetShapeOrDataForValue(op->result(0), shape_data); - + if (x_shape.data() && y_shape.data() && DataComputeFunc) { + PADDLE_ENFORCE_LE( + x_shape.shape().size(), + 1, + common::errors::InvalidArgument("When computing data, the rank of x " + "should be 0 or 1, but now received %d", + x_shape.shape().size())); + PADDLE_ENFORCE_LE( + y_shape.shape().size(), + 1, + common::errors::InvalidArgument("When computing data, the rank of y " + "should be 0 or 1, but now received %d", + y_shape.shape().size())); + PADDLE_ENFORCE_EQ(x_shape.data()->size(), + y_shape.data()->size(), + common::errors::InvalidArgument( + "When computing data, the size of x and y should be " + "equal, but now received %d and %d", + x_shape.data()->size(), + y_shape.data()->size())); + std::vector out_data; + for (size_t i = 0; i < x_shape.data()->size(); ++i) { + out_data.emplace_back( + DataComputeFunc(x_shape.data()->at(i), y_shape.data()->at(i))); + } + symbol::ShapeOrDataDimExprs shape_data{ + symbol::TensorShapeOrDataDimExprs(shapes, out_data)}; + shape_analysis->SetShapeOrDataForValue(op->result(0), shape_data); + } else { + symbol::ShapeOrDataDimExprs shape_data{ + symbol::TensorShapeOrDataDimExprs(shapes)}; + shape_analysis->SetShapeOrDataForValue(op->result(0), shape_data); + } return true; } @@ -91,14 +101,45 @@ bool InferSymbolicShapeElementWiseBinary( } namespace paddle::dialect { + +bool AddOpInferSymbolicShape(pir::Operation *op, + pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary( + op, + shape_analysis, + [](const symbol::DimExpr &x, const symbol::DimExpr &y) { return x + y; }); +} + +bool DivideOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary( + op, + shape_analysis, + [](const symbol::DimExpr &x, const symbol::DimExpr &y) { return x / y; }); +} + +bool MultiplyOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary( + op, + shape_analysis, + [](const symbol::DimExpr &x, const symbol::DimExpr &y) { return x * y; }); +} + +bool SubtractOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary( + op, + shape_analysis, + [](const symbol::DimExpr &x, const symbol::DimExpr &y) { return x - y;
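
Editor's note: the hunk above collapses Add/Divide/Multiply/Subtract into a single InferSymbolicShapeElementWiseBinary helper. The two operand shapes are right-aligned and broadcast (recording a broadcastable constraint for each symbolic pair), and only when both operands carry compile-time data and a DataComputeFunc is supplied is the data folded element by element. A minimal standalone sketch of that control flow, with plain int64_t standing in for symbol::DimExpr (all names illustrative, not Paddle's API):

```cpp
#include <cassert>
#include <cstdint>
#include <functional>
#include <iostream>
#include <optional>
#include <vector>

using Dims = std::vector<int64_t>;
using BinaryFn = std::function<int64_t(int64_t, int64_t)>;

// Right-align the shorter shape with leading 1s, then broadcast per dim.
Dims BroadcastShapes(Dims a, Dims b) {
  while (a.size() < b.size()) a.insert(a.begin(), 1);
  while (b.size() < a.size()) b.insert(b.begin(), 1);
  Dims out(a.size());
  for (size_t i = 0; i < a.size(); ++i) {
    assert(a[i] == b[i] || a[i] == 1 || b[i] == 1);  // broadcastable constraint
    out[i] = (a[i] == 1) ? b[i] : a[i];
  }
  return out;
}

// Fold per-element data only when both sides carry it and a compute fn is
// given, mirroring the DataComputeFunc branch in the patch.
std::optional<std::vector<int64_t>> FoldData(
    const std::optional<std::vector<int64_t>>& x,
    const std::optional<std::vector<int64_t>>& y,
    const BinaryFn& fn) {
  if (!x || !y || !fn) return std::nullopt;
  assert(x->size() == y->size());
  std::vector<int64_t> out;
  for (size_t i = 0; i < x->size(); ++i) out.push_back(fn((*x)[i], (*y)[i]));
  return out;
}

int main() {
  Dims shape = BroadcastShapes({2, 1, 4}, {3, 4});  // -> {2, 3, 4}
  auto data = FoldData(std::vector<int64_t>{4, 5},
                       std::vector<int64_t>{1, 2},
                       [](int64_t a, int64_t b) { return a + b; });
  std::cout << shape[1] << " " << (*data)[1] << "\n";  // prints "3 7"
  return 0;
}
```

Passing the fold as a lambda keeps the broadcast logic in one place while each op contributes only its arithmetic.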
}); +} + OP_ELEMENT_WISE_BINARY(Add_) OP_ELEMENT_WISE_BINARY(BitwiseAnd) OP_ELEMENT_WISE_BINARY(BitwiseAnd_) OP_ELEMENT_WISE_BINARY(BitwiseXor) OP_ELEMENT_WISE_BINARY(BitwiseXor_) OP_ELEMENT_WISE_BINARY(Complex) -OP_ELEMENT_WISE_BINARY(Divide) OP_ELEMENT_WISE_BINARY(Divide_) OP_ELEMENT_WISE_BINARY(ElementwisePow) OP_ELEMENT_WISE_BINARY(Fmax) @@ -119,7 +160,6 @@ OP_ELEMENT_WISE_BINARY(LogicalXor) OP_ELEMENT_WISE_BINARY(LogicalXor_) OP_ELEMENT_WISE_BINARY(Maximum) OP_ELEMENT_WISE_BINARY(Minimum) -OP_ELEMENT_WISE_BINARY(Multiply) OP_ELEMENT_WISE_BINARY(MultiplySr) OP_ELEMENT_WISE_BINARY(MultiplySr_) OP_ELEMENT_WISE_BINARY(Multiply_) @@ -127,7 +167,6 @@ OP_ELEMENT_WISE_BINARY(NotEqual) OP_ELEMENT_WISE_BINARY(NotEqual_) OP_ELEMENT_WISE_BINARY(Remainder) OP_ELEMENT_WISE_BINARY(Remainder_) -OP_ELEMENT_WISE_BINARY(Subtract) OP_ELEMENT_WISE_BINARY(Subtract_) } // namespace paddle::dialect diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_slice_utils.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_slice_utils.h index 345c55e1a116b..a1d6f5845802e 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_slice_utils.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_slice_utils.h @@ -159,13 +159,19 @@ inline ShapeOrData SliceRawInferSymbolicShape( // Currently, we DO NOT support the case that any element in `axes` `starts` // or `ends` is a Symbol. auto vec_int64 = details::VecExpr2Int64(starts); - IR_ENFORCE(vec_int64.has_value(), - "for slice op, all the elements in `starts` must be int64_t"); + PADDLE_ENFORCE_EQ( + vec_int64.has_value(), + true, + phi::errors::InvalidArgument( + "for slice op, all the elements in `starts` must be int64_t")); std::vector starts_int = vec_int64.value(); vec_int64 = details::VecExpr2Int64(ends); - IR_ENFORCE(vec_int64.has_value(), - "for slice op, all the elements in `ends` must be int64_t"); + PADDLE_ENFORCE_EQ( + vec_int64.has_value(), + true, + phi::errors::InvalidArgument( + "for slice op, all the elements in `ends` must be int64_t")); std::vector ends_int = vec_int64.value(); const int64_t start = diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.cc index 30730170e23a2..1026005ab7fc8 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.cc @@ -104,8 +104,8 @@ void BuildCstrEqForTensorListAlongAxis( const symbol::TensorListShapeOrDataDimExprs &shape_data_list, int axis) { for (size_t i = 1; i < shape_data_list.size(); ++i) { - shape_analysis->DimExprBuilder().CstrEq(shape_data_list[0].shape()[axis], - shape_data_list[i].shape()[axis]); + shape_analysis->AddEqualCstr(shape_data_list[0].shape()[axis], + shape_data_list[i].shape()[axis]); } } @@ -114,7 +114,7 @@ void BuildCstrEqForTensorListAlongAxis( const std::vector &values, int axis) { for (size_t i = 1; i < values.size(); ++i) { - shape_analysis->DimExprBuilder().CstrEq( + shape_analysis->AddEqualCstr( shape_analysis->GetShapeOrDataForValue(values[0]).shape()[axis], shape_analysis->GetShapeOrDataForValue(values[i]).shape()[axis]); } diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h index 
7984fc3be4e46..42164c3c21254 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h @@ -47,6 +47,11 @@ struct AttributeTrait { using value_type = ::pir::Int32Attribute; }; +template <> +struct AttributeTrait { + using value_type = ::pir::FloatAttribute; +}; + template std::vector GetVectorAttr(const ::pir::Operation *op, const std::string &name) { @@ -82,8 +87,10 @@ inline ExprVec GetExprVecFromData(const ShapeOrData &shapeordata) { TensorListExprs list = shapeordata.dyn_cast(); for (size_t i = 0; i < list.size(); i++) { - for (auto expr : list[i].data().value()) { - result.emplace_back(expr); + if (list[i].data().has_value()) { + for (auto expr : list[i].data().value()) { + result.emplace_back(expr); + } } } return result; diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc index e96ede7488814..b5bb10f4f173e 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc @@ -14,12 +14,174 @@ #include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h" #include "paddle/common/ddim.h" +#include "paddle/common/layout.h" #include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_slice_utils.h" #include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h" #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" namespace paddle::dialect { +bool BicubicInterpOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + const symbol::ShapeOrDataDimExprs &x = + shape_analysis->GetShapeOrDataForValue(op->operand_source(0)); + + const auto &attributes = op->attributes(); + + const std::string data_format = + attributes.at("data_format").dyn_cast().AsString(); + int out_d = attributes.at("out_d").dyn_cast().data(); + int out_h = attributes.at("out_h").dyn_cast().data(); + int out_w = attributes.at("out_w").dyn_cast().data(); + + std::vector size_tensor; + if (out_d != -1) size_tensor.push_back(out_d); + if (out_h != -1) size_tensor.push_back(out_h); + if (out_w != -1) size_tensor.push_back(out_w); + + const DataLayout data_layout = common::StringToDataLayout(data_format); + + if (x.shape().size() == 3) { + // shape check for 1D interpolate for input tensor shape NCHW + if (!size_tensor.empty()) { + // top priority size + std::vector dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {x.shape()[0], x.shape()[1], symbol::DimExpr{out_w}}; + } else { + dim_out = {x.shape()[0], symbol::DimExpr{out_w}, x.shape()[2]}; + } + + symbol::ShapeOrDataDimExprs shape_data{ + symbol::TensorShapeOrDataDimExprs(dim_out)}; + + pir::Value res = op->result(0); + shape_analysis->SetShapeOrDataForValue(res, shape_data); + return true; + } + + symbol::DimExpr out_w_tmp{0}; + const auto &next_sym = shape_analysis->GetNextSymName(); + out_w_tmp = symbol::DimExpr(next_sym); + + std::vector dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {x.shape()[0], x.shape()[1], out_w_tmp}; + } else { + dim_out = {x.shape()[0], out_w_tmp, x.shape()[2]}; + } + + symbol::ShapeOrDataDimExprs shape_data{ + symbol::TensorShapeOrDataDimExprs(dim_out)}; + + pir::Value res = op->result(0); + 
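
Editor's note: the infer_sym_utils.h hunk in this chunk adds a float specialization of AttributeTrait, the compile-time map that GetVectorAttr uses to pick the right attribute cast. The trait idiom in isolation, with toy stand-ins for the pir attribute classes:

```cpp
#include <cstdint>
#include <iostream>

// Toy stand-ins for pir attribute classes (illustrative only).
struct Int32Attribute { static constexpr const char* name = "Int32Attribute"; };
struct FloatAttribute { static constexpr const char* name = "FloatAttribute"; };

// Primary template is declared but not defined: using an unmapped C++ type
// is a compile-time error instead of a silent wrong cast.
template <typename T>
struct AttributeTrait;

template <>
struct AttributeTrait<int32_t> { using value_type = Int32Attribute; };

// The patch adds exactly this kind of mapping for float.
template <>
struct AttributeTrait<float> { using value_type = FloatAttribute; };

template <typename T>
void PrintMappedAttr() {
  std::cout << AttributeTrait<T>::value_type::name << "\n";
}

int main() {
  PrintMappedAttr<int32_t>();  // Int32Attribute
  PrintMappedAttr<float>();    // FloatAttribute
  return 0;
}
```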
shape_analysis->SetShapeOrDataForValue(res, shape_data); + return true; + } else if (x.shape().size() == 4) { + // shape check for 2D interpolate for input tensor shape NCHW + if (!size_tensor.empty()) { + // top priority size + std::vector dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {x.shape()[0], + x.shape()[1], + symbol::DimExpr{out_h}, + symbol::DimExpr{out_w}}; + } else { + dim_out = {x.shape()[0], + symbol::DimExpr{out_h}, + symbol::DimExpr{out_w}, + x.shape()[3]}; + } + + symbol::ShapeOrDataDimExprs shape_data{ + symbol::TensorShapeOrDataDimExprs(dim_out)}; + pir::Value res = op->result(0); + shape_analysis->SetShapeOrDataForValue(res, shape_data); + return true; + } + + symbol::DimExpr out_h_tmp{0}; + symbol::DimExpr out_w_tmp{0}; + const auto &next_sym = shape_analysis->GetNextSymName(); + out_h_tmp = symbol::DimExpr(next_sym); + out_w_tmp = symbol::DimExpr(next_sym); + + std::vector dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {x.shape()[0], x.shape()[1], out_h_tmp, out_w_tmp}; + } else { + dim_out = {x.shape()[0], out_h_tmp, out_w_tmp, x.shape()[3]}; + } + + symbol::ShapeOrDataDimExprs shape_data{ + symbol::TensorShapeOrDataDimExprs(dim_out)}; + + pir::Value res = op->result(0); + shape_analysis->SetShapeOrDataForValue(res, shape_data); + return true; + } else if (x.shape().size() == 5) { + // shape check for 3D interpolate for input tensor shape NCDHW + if (!size_tensor.empty()) { + // top priority size + std::vector dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {x.shape()[0], + x.shape()[1], + symbol::DimExpr{out_d}, + symbol::DimExpr{out_h}, + symbol::DimExpr{out_w}}; + } else { + dim_out = {x.shape()[0], + symbol::DimExpr{out_d}, + symbol::DimExpr{out_h}, + symbol::DimExpr{out_w}, + x.shape()[4]}; + } + + symbol::ShapeOrDataDimExprs shape_data{ + symbol::TensorShapeOrDataDimExprs(dim_out)}; + + pir::Value res = op->result(0); + shape_analysis->SetShapeOrDataForValue(res, shape_data); + return true; + } + + symbol::DimExpr out_d_tmp{0}; + symbol::DimExpr out_h_tmp{0}; + symbol::DimExpr out_w_tmp{0}; + const auto &next_sym = shape_analysis->GetNextSymName(); + out_d_tmp = symbol::DimExpr(next_sym); + out_h_tmp = symbol::DimExpr(next_sym); + out_w_tmp = symbol::DimExpr(next_sym); + + std::vector dim_out; + + if (data_layout == DataLayout::kNCHW) { + dim_out = {x.shape()[0], x.shape()[1], out_d_tmp, out_h_tmp, out_w_tmp}; + } else { + dim_out = {x.shape()[0], out_d_tmp, out_h_tmp, out_w_tmp, x.shape()[4]}; + } + + symbol::ShapeOrDataDimExprs shape_data{ + symbol::TensorShapeOrDataDimExprs(dim_out)}; + + pir::Value res = op->result(0); + shape_analysis->SetShapeOrDataForValue(res, shape_data); + return true; + + } else { + PADDLE_THROW(phi::errors::Fatal("Input(X) dimension must be 3, 4 or 5!")); + } + + return true; +} + +bool BilinearInterpOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return BicubicInterpOpInferSymbolicShape(op, shape_analysis); +} + bool ConcatOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { pir::Value operand_source = op->operand_source(0); @@ -41,8 +203,10 @@ bool ConcatOpInferSymbolicShape( if (shape_data_list[0].data().has_value()) { if (rank == 1) { - ExprVec data = details::GetExprVecFromData( - shape_analysis->GetShapeOrDataForValue(operand_source)); + const auto &s_or_d = + shape_analysis->GetShapeOrDataForValue(operand_source); + ExprVec data = details::GetExprVecFromData(s_or_d); + const std::vector 
shape{std::int64_t(data.size())}; symbol::ShapeOrDataDimExprs shape_data{ symbol::TensorShapeOrDataDimExprs(shape, data)}; @@ -95,7 +259,7 @@ bool ConcatOpInferSymbolicShape( bool FullWithTensorOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - pir::Value operand_source = op->operand_source(0); + pir::Value operand_source = op->operand_source(1); const symbol::ShapeOrDataDimExprs &operand_shape_or_data = shape_analysis->GetShapeOrDataForValue(operand_source); @@ -147,11 +311,54 @@ bool LinspaceOpInferSymbolicShape( shape_analysis->SetShapeOrDataForValue(op->result(0), shape_data); return true; } + +bool LinearInterpOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return BicubicInterpOpInferSymbolicShape(op, shape_analysis); +} + bool LogspaceOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return LinspaceOpInferSymbolicShape(op, shape_analysis); } +bool NearestInterpOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return BicubicInterpOpInferSymbolicShape(op, shape_analysis); +} + +bool MeshgridOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + const symbol::TensorListShapeOrDataDimExprs &shape_data_list = + shape_analysis->GetShapeOrDataForValue(op->operand_source(0)) + .dyn_cast(); + + const symbol::ShapeOrDataDimExprs sym_shape_dim_exprs = [&] { + symbol::TensorListShapeOrDataDimExprs shape_dim_exprs_list; + std::vector vec; + + for (auto &shape_data : shape_data_list) { + if (shape_data.shape().size() == 0) { + vec.emplace_back(1); + } else { + vec.emplace_back(shape_data.shape()[0]); + } + } + + auto shape_dim_exprs = symbol::TensorShapeOrDataDimExprs(vec); + + for (size_t i = 0; i < shape_data_list.size(); i++) { + shape_dim_exprs_list.emplace_back(shape_dim_exprs); + } + + return symbol::ShapeOrDataDimExprs(shape_dim_exprs_list); + }(); + + pir::Value res = op->result(0); + shape_analysis->SetShapeOrDataForValue(res, sym_shape_dim_exprs); + return true; +} + bool StackOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { pir::Value operand_source = op->operand_source(0); @@ -196,6 +403,11 @@ bool StackOpInferSymbolicShape(pir::Operation *op, return true; } +bool TrilinearInterpOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return BicubicInterpOpInferSymbolicShape(op, shape_analysis); +} + bool WhereOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { shape_analysis->SetShapeOrDataForValue( diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h index f2907bed0a4fd..be528d31139cf 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h @@ -18,12 +18,18 @@ namespace paddle::dialect { +OP_DECLARE_INFER_SYMBOLIC_SHAPE(BicubicInterp) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(BilinearInterp) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Concat) OP_DECLARE_INFER_SYMBOLIC_SHAPE(FullWithTensor) OP_DECLARE_INFER_SYMBOLIC_SHAPE(FlashAttn) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Linspace) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(LinearInterp) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Logspace) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(Meshgrid) 
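
Editor's note: reading the meshgrid hunk above, with k inputs every one of the k outputs gets the same rank-k shape, assembled from each input's leading dimension; a 0-D input contributes a 1. A short sketch under that reading, modelling symbolic dims as strings:

```cpp
#include <iostream>
#include <string>
#include <vector>

using Dims = std::vector<std::string>;  // symbolic dims as names, e.g. "S0"

// Each of the k outputs of meshgrid has the same rank-k shape, built from
// the leading (only) dimension of every input; 0-D inputs contribute a 1.
std::vector<Dims> MeshgridOutShapes(const std::vector<Dims>& inputs) {
  Dims common;
  for (const Dims& in : inputs) common.push_back(in.empty() ? "1" : in.front());
  return std::vector<Dims>(inputs.size(), common);
}

int main() {
  auto outs = MeshgridOutShapes({{"S0"}, {"S1"}, {}});
  for (const auto& shape : outs) {
    for (const auto& d : shape) std::cout << d << " ";
    std::cout << "\n";  // each line: S0 S1 1
  }
  return 0;
}
```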
+OP_DECLARE_INFER_SYMBOLIC_SHAPE(NearestInterp) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Stack) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(TrilinearInterp) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Where) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Where_) diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/nullary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/nullary_infer_sym.cc index 0bec3266bfb30..0e294991449c1 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/nullary_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/nullary_infer_sym.cc @@ -80,6 +80,27 @@ bool AssignValueOpInferSymbolicShape( sym_dims.emplace_back(symbol::DimExpr(static_cast(dim))); } + const auto &attributes = op->attributes(); + std::vector values; + for (size_t i = 0; + i < attributes.at("values").dyn_cast().size(); + i++) { + values.push_back(attributes.at("values") + .dyn_cast() + .at(i) + .dyn_cast() + .data() + .to()); + } + if (values.size() == 1) { + std::vector data{values[0]}; + + symbol::ShapeOrDataDimExprs shape_data{ + symbol::TensorShapeOrDataDimExprs(sym_dims, data)}; + shape_analysis->SetShapeOrDataForValue(op->result(0), shape_data); + return true; + } + symbol::ShapeOrDataDimExprs shape_data{ symbol::TensorShapeOrDataDimExprs(sym_dims)}; shape_analysis->SetShapeOrDataForValue(op->result(0), shape_data); diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_result.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_result.cc index 04e5032098367..e96d10018c1d1 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_result.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_result.cc @@ -99,6 +99,7 @@ OP_SAME_OPERANDS_AND_RESULT(PutAlongAxis) OP_SAME_OPERANDS_AND_RESULT(PutAlongAxis_) OP_SAME_OPERANDS_AND_RESULT(Real) OP_SAME_OPERANDS_AND_RESULT(Relu) +OP_SAME_OPERANDS_AND_RESULT(Relu6) OP_SAME_OPERANDS_AND_RESULT(Relu_) OP_SAME_OPERANDS_AND_RESULT(Roll) OP_SAME_OPERANDS_AND_RESULT(Round) diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_result.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_result.h index 41363fbe70604..96073b1271a32 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_result.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_result.h @@ -90,6 +90,7 @@ OP_DECLARE_INFER_SYMBOLIC_SHAPE(PutAlongAxis) OP_DECLARE_INFER_SYMBOLIC_SHAPE(PutAlongAxis_) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Real) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Relu) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(Relu6) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Relu_) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Roll) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Round) diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc index 9f7b688f2825c..4dab7e358f05e 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc @@ -216,7 +216,7 @@ bool DiagonalOpInferSymbolicShape( out_dims.erase(out_dims.begin() + std::max(axis1_, axis2_)); out_dims.erase(out_dims.begin() + std::min(axis1_, axis2_)); - symbol::DimExprBuilder builder{nullptr}; + symbol::DimExprBuilder builder; 
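
Editor's note: the assign_value change above records the literal itself as data when exactly one value is present, so consumers on the shape-computation path can constant-fold it. The shape-versus-data split, reduced to a toy model (not the real symbol::TensorShapeOrDataDimExprs):

```cpp
#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

// Simplified model of TensorShapeOrDataDimExprs: a shape, plus optional
// compile-time data for tensors that feed shape computations.
struct ShapeOrData {
  std::vector<int64_t> shape;
  std::optional<std::vector<int64_t>> data;  // known values, if any
};

// Mirrors the assign_value special case: a single literal is recorded as
// data so later ops (e.g. a reshape reading this tensor) can fold it.
ShapeOrData InferAssignValue(const std::vector<int64_t>& dims,
                             const std::vector<int64_t>& values) {
  if (values.size() == 1) return {dims, values};
  return {dims, std::nullopt};  // shape only
}

int main() {
  ShapeOrData s = InferAssignValue({1}, {42});
  std::cout << (s.data ? s.data->front() : int64_t{-1}) << "\n";  // 42
  return 0;
}
```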
symbol::DimExpr zero{0}; symbol::DimExpr res_shape; symbol::DimExpr offset_sym{offset}; @@ -330,8 +330,41 @@ bool MinOpInferSymbolicShape(pir::Operation *op, bool PadOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); + // input(0): Tensor x + const auto &x_shape_or_data = + shape_analysis->GetShapeOrDataForValue(op->operand_source(0)); + PADDLE_ENFORCE_EQ(x_shape_or_data.data().has_value(), + false, + phi::errors::InvalidArgument( + "InferSymbolicShape of PadOp only support input with " + "value now.")); + const auto &x_dims_sym = x_shape_or_data.shape(); + const size_t rank = x_dims_sym.size(); + + // input(1): int[] paddings + std::vector paddings = + paddle::dialect::details::GetVectorAttr(op, "paddings"); + PADDLE_ENFORCE_EQ(rank * 2, + paddings.size(), + phi::errors::InvalidArgument( + "The size of paddings should be 2 * input's rank. But " + "got paddings.size() = %d, input's rank = %d.", + paddings.size(), + rank)); + + // output + const auto &out_dims = [&] { + std::vector out_dims; + out_dims.reserve(rank); + for (size_t i = 0; i < rank; ++i) { + out_dims.push_back(x_dims_sym[i] + paddings[2 * i] + paddings[2 * i + 1]); + } + return out_dims; + }(); + + shape_analysis->SetShapeOrDataForValue( + op->result(0), symbol::TensorShapeOrDataDimExprs(out_dims)); + return true; } @@ -416,22 +449,24 @@ symbol::ShapeOrDataDimExprs CreateShapeOrDataForXShape( bool ReshapeOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - pir::Value operand_source = op->operand_source(0); - if (shape_analysis->GetShapeOrDataForValue(operand_source) - .data() - .has_value()) { - const symbol::ShapeOrDataDimExprs &operand_shape_or_data = - shape_analysis->GetShapeOrDataForValue(operand_source); - shape_analysis->SetShapeOrDataForValue(op->result(0), - operand_shape_or_data); - return true; + const symbol::ShapeOrDataDimExprs &x_dim_expr = + shape_analysis->GetShapeOrDataForValue(op->operand_source(0)); + const symbol::ShapeOrDataDimExprs &shape_dim_expr = + shape_analysis->GetShapeOrDataForValue(op->operand_source(1)); + if (x_dim_expr.data().has_value()) { + const auto &shape_data = details::GetExprVecFromData(shape_dim_expr); + auto IsOne = [](const symbol::DimExpr &expr) { + return expr.isa() && expr.dyn_cast() == 1; + }; + if (shape_data.size() == 1 && IsOne(shape_data.at(0))) { + shape_analysis->SetShapeOrDataForValue( + op->result(0), + symbol::TensorShapeOrDataDimExprs(shape_data, + x_dim_expr.data().value())); + return true; + } } - pir::Value operand_source_shape = op->operand_source(1); - - const symbol::ShapeOrDataDimExprs &operand_shape_or_data = - shape_analysis->GetShapeOrDataForValue(operand_source_shape); - const auto &GetProduct = [&](const auto &dim_exprs, const auto &Filter) { symbol::DimExpr product{1}; for (const auto &dim_expr : dim_exprs) { @@ -463,7 +498,7 @@ bool ReshapeOpInferSymbolicShape( const auto &numel = GetProduct(original_shape, [](const auto &) { return true; }); - ExprVec target_shape = details::GetExprVecFromData(operand_shape_or_data); + ExprVec target_shape = details::GetExprVecFromData(shape_dim_expr); const auto &product_exclude_minus_one = GetProduct(target_shape, IsNotMinusOne); @@ -487,7 +522,7 @@ bool ReshapeOpInferSymbolicShape( shape_analysis->SetShapeOrDataForValue(op->result(0), shape_data); - const auto &x_shape = [&] { + const auto UNUSED &x_shape = [&] { 
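
Editor's note: the pad rule added above is plain arithmetic over dim expressions: each output dim is the input dim plus its before/after padding, with paddings holding one (before, after) pair per input dim, hence the 2 * rank size check. A concrete check, ints standing in for DimExprs:

```cpp
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// out[i] = x[i] + paddings[2*i] + paddings[2*i + 1]
std::vector<int64_t> PadOutDims(const std::vector<int64_t>& x,
                                const std::vector<int32_t>& paddings) {
  assert(paddings.size() == 2 * x.size());  // one (before, after) pair per dim
  std::vector<int64_t> out;
  out.reserve(x.size());
  for (size_t i = 0; i < x.size(); ++i)
    out.push_back(x[i] + paddings[2 * i] + paddings[2 * i + 1]);
  return out;
}

int main() {
  auto out = PadOutDims({4, 5}, {1, 1, 0, 2});
  std::cout << out[0] << "x" << out[1] << "\n";  // 6x7
  return 0;
}
```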
std::vector x_shape{symbol::DimExpr(0)}; const auto &original_shape = shape_analysis->GetShapeOrDataForValue(op->operand_source(0)).shape(); @@ -499,7 +534,7 @@ bool ReshapeOpInferSymbolicShape( shape_analysis->SetShapeOrDataForValue( op->result(1), CreateShapeOrDataForXShape( - shape_analysis->GetShapeOrDataForValue(operand_source))); + shape_analysis->GetShapeOrDataForValue(op->operand_source(0)))); return true; } @@ -585,6 +620,8 @@ bool SplitOpInferSymbolicShape(pir::Operation *op, .dyn_cast() .data() .to(); + size_t rank = x_dims_sym.size(); + axis = axis >= 0 ? axis : std::max(int64_t(0), int64_t(axis + rank)); // sections const std::vector §ions_sym = [&] { @@ -629,8 +666,7 @@ bool SplitOpInferSymbolicShape(pir::Operation *op, const bool &all_sections_sym_not_minus_one = All(sections_sym, IsNotMinusOne); if (all_sections_sym_not_minus_one) { - shape_analysis->DimExprBuilder().CstrEq(x_dims_sym[axis], - sum_exclude_minus_one); + shape_analysis->AddEqualCstr(x_dims_sym[axis], sum_exclude_minus_one); } symbol::TensorListShapeOrDataDimExprs shape_data_list; @@ -840,7 +876,7 @@ bool TransposeOpInferSymbolicShape( int x_rank = x_dims.size(); - const std::vector formatted_axis = [op, x_rank, &perm] { + const std::vector formatted_axis = [x_rank, &perm] { std::vector out(perm.size(), 0); std::transform(perm.begin(), perm.end(), @@ -878,10 +914,13 @@ bool Transpose_OpInferSymbolicShape( bool SqueezeOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - IR_ENFORCE(op->num_operands() == 2, - "SqueezeOpInferSymbolicShape ONLY support num_operands() == 2 " - "now, but got %d operands", - op->num_operands()); + PADDLE_ENFORCE_EQ( + op->num_operands(), + 2, + phi::errors::InvalidArgument( + "SqueezeOpInferSymbolicShape ONLY support num_operands() == 2 " + "now, but got %d operands", + op->num_operands())); auto x_shape_or_data = shape_analysis->GetShapeOrDataForValue(op->operand_source(0)); @@ -904,10 +943,13 @@ bool SqueezeOpInferSymbolicShape( std::vector squeeze_dims; for (auto squeeze_dim : squeeze_dims_sym) { - IR_ENFORCE(squeeze_dim.Has(), - "in SqueezeOpInferSymbolicShape, axes must be known int type, " - "but got: %s", - symbol::ToString(squeeze_dim)); + PADDLE_ENFORCE_EQ( + squeeze_dim.Has(), + true, + phi::errors::InvalidArgument( + "in SqueezeOpInferSymbolicShape, axes must be known int type, " + "but got: %s", + symbol::ToString(squeeze_dim))); squeeze_dims.emplace_back( static_cast(squeeze_dim.Get())); } @@ -970,31 +1012,186 @@ bool Squeeze_OpInferSymbolicShape( bool UnbindOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); + // input + const auto &x_shape_or_data = + shape_analysis->GetShapeOrDataForValue(op->operand_source(0)); + PADDLE_ENFORCE_EQ( + x_shape_or_data.data().has_value(), + false, + phi::errors::InvalidArgument( + "InferSymbolicShape of UnbindOp only support input with " + "value now.")); + const auto &x_dims_sym = x_shape_or_data.shape(); + + // axis + int axis = op->attributes().at("axis").dyn_cast().data(); + int rank = x_dims_sym.size(); + axis = axis >= 0 ? 
axis : axis + rank; + + // output + const symbol::TensorListShapeOrDataDimExprs &output_shape_data_list = [&] { + symbol::TensorListShapeOrDataDimExprs shape_data_list; + std::vector output_dims_sym = x_dims_sym; + + const symbol::DimExpr &unbound_dim = x_dims_sym.at(axis); + PADDLE_ENFORCE_EQ(unbound_dim.isa(), + true, + phi::errors::InvalidArgument( + "InferSymbolicShape of UnbindOp only support unbound " + "dim with constant length!")); + output_dims_sym.erase(output_dims_sym.begin() + axis); + const int64_t unbound_dim_length = unbound_dim.dyn_cast(); + + for (uint32_t idx = 0; idx < unbound_dim_length; idx++) { + shape_data_list.push_back( + symbol::TensorShapeOrDataDimExprs(output_dims_sym)); + } + return shape_data_list; + }(); + + shape_analysis->SetShapeOrDataForValue( + op->result(0), symbol::ShapeOrDataDimExprs{output_shape_data_list}); + return true; } bool UniqueOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); + const auto &x_shape_or_data = + shape_analysis->GetShapeOrDataForValue(op->operand_source(0)); + PADDLE_ENFORCE_EQ( + x_shape_or_data.data().has_value(), + false, + phi::errors::InvalidArgument( + "InferSymbolicShape of UniqueOp only support input with " + "value now.")); + const auto &x_dims_sym = x_shape_or_data.shape(); + const size_t rank = x_dims_sym.size(); + std::vector axes = + paddle::dialect::details::GetVectorAttr(op, "axis"); + + symbol::DimExpr unique_dim_sym = + shape_analysis->GetNextSymName(); // unknown until runtime + + const std::vector &counts_dims = [&] { + std::vector out_dims; + out_dims.push_back(unique_dim_sym); + return out_dims; + }(); + + const std::vector &index_dims = counts_dims; + + const std::vector &out_dims = [&] { + if (axes.empty()) { + return counts_dims; + } + std::vector out_dims = x_dims_sym; + int axis = axes[0]; + axis = axis >= 0 ? axis : axis + rank; + out_dims[axis] = unique_dim_sym; + return out_dims; + }(); + + const std::vector &inverse_dims = [&] { + std::vector inverse_dims; + if (axes.empty()) { + // flatten before unique + symbol::DimExpr product{1}; + for (const auto &x_dim : x_dims_sym) { + product = product * x_dim; + } + inverse_dims.push_back(product); + } else { + int axis = axes[0]; + axis = axis >= 0 ? 
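
Editor's note: unbind, unique, and unique_consecutive in these hunks all rely on GetNextSymName for lengths that only exist at runtime: the analysis mints a fresh symbol and reuses the same symbol wherever outputs must agree (here, out, index, and counts all share the unknown unique count). A toy generator showing the mechanism:

```cpp
#include <iostream>
#include <string>
#include <vector>

// Data-dependent ops (unique, masked_select) cannot know their output
// length at compile time, so the analysis mints a fresh symbol per call;
// equalities against that symbol can still be recorded as constraints.
class SymNameGenerator {
 public:
  std::string Next() { return "S" + std::to_string(counter_++); }

 private:
  int counter_ = 0;
};

int main() {
  SymNameGenerator gen;
  std::string unique_len = gen.Next();             // e.g. "S0"
  std::vector<std::string> out_dims = {unique_len};
  std::vector<std::string> counts_dims = out_dims;  // same unknown length
  std::cout << out_dims[0] << " " << counts_dims[0] << "\n";  // S0 S0
  return 0;
}
```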
axis : axis + rank; + inverse_dims.push_back(x_dims_sym[axis]); + } + return inverse_dims; + }(); + + shape_analysis->SetShapeOrDataForValue( + op->result(0), symbol::TensorShapeOrDataDimExprs{out_dims}); + shape_analysis->SetShapeOrDataForValue( + op->result(1), symbol::TensorShapeOrDataDimExprs{index_dims}); + shape_analysis->SetShapeOrDataForValue( + op->result(2), symbol::TensorShapeOrDataDimExprs{inverse_dims}); + shape_analysis->SetShapeOrDataForValue( + op->result(3), symbol::TensorShapeOrDataDimExprs{counts_dims}); + return true; } bool UniqueConsecutiveOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); + const auto &x_shape_or_data = + shape_analysis->GetShapeOrDataForValue(op->operand_source(0)); + PADDLE_ENFORCE_EQ( + x_shape_or_data.data().has_value(), + false, + phi::errors::InvalidArgument( + "InferSymbolicShape of UniqueConsecutiveOp only support input with " + "value now.")); + const auto &x_dims_sym = x_shape_or_data.shape(); + const size_t rank = x_dims_sym.size(); + std::vector axes = + paddle::dialect::details::GetVectorAttr(op, "axis"); + + symbol::DimExpr unique_dim_sym = + shape_analysis->GetNextSymName(); // unknown until runtime + + const std::vector &counts_dims = [&] { + std::vector out_dims; + out_dims.push_back(unique_dim_sym); + return out_dims; + }(); + + const std::vector &out_dims = [&] { + if (axes.empty()) { + return counts_dims; + } + std::vector out_dims = x_dims_sym; + int axis = axes[0]; + axis = axis >= 0 ? axis : axis + rank; + out_dims[axis] = unique_dim_sym; + return out_dims; + }(); + + const std::vector &inverse_dims = [&] { + std::vector inverse_dims; + if (axes.empty()) { + // flatten before unique + symbol::DimExpr product{1}; + for (const auto &x_dim : x_dims_sym) { + product = product * x_dim; + } + inverse_dims.push_back(product); + } else { + int axis = axes[0]; + axis = axis >= 0 ? 
axis : axis + rank; + inverse_dims.push_back(x_dims_sym[axis]); + } + return inverse_dims; + }(); + + shape_analysis->SetShapeOrDataForValue( + op->result(0), symbol::TensorShapeOrDataDimExprs{out_dims}); + shape_analysis->SetShapeOrDataForValue( + op->result(1), symbol::TensorShapeOrDataDimExprs{inverse_dims}); + shape_analysis->SetShapeOrDataForValue( + op->result(2), symbol::TensorShapeOrDataDimExprs{counts_dims}); + return true; } bool UnsqueezeOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - IR_ENFORCE(op->num_operands() == 2, - "UnsqueezeOp InferSymbolicShape ONLY support num_operands() == 2 " - "now, but got %d operands", - op->num_operands()); + PADDLE_ENFORCE_EQ( + op->num_operands(), + 2, + phi::errors::InvalidArgument( + "UnsqueezeOp InferSymbolicShape ONLY support num_operands() == 2 " + "now, but got %d operands", + op->num_operands())); auto x_shape_or_data = shape_analysis->GetShapeOrDataForValue(op->operand_source(0)); @@ -1023,10 +1220,13 @@ bool UnsqueezeOpInferSymbolicShape( int cur_output_rank = x_dims_size; for (auto axis_expr : axes_sym) { - IR_ENFORCE(axis_expr.Has(), - "in UnsqueezeOpInferSymbolicShape, axes must be known int type, " - "but got: %s", - symbol::ToString(axis_expr)); + PADDLE_ENFORCE_EQ( + axis_expr.Has(), + true, + phi::errors::InvalidArgument( + "in UnsqueezeOpInferSymbolicShape, axes must be known int type, " + "but got: %s", + symbol::ToString(axis_expr))); int axis = static_cast(axis_expr.Get()); int cur = axis < 0 ? axis + cur_output_rank + 1 : axis; diff --git a/paddle/cinn/frontend/group_cluster/cluster_policy/shardable_axes_policy/shardable_axes_policy.cc b/paddle/fluid/pir/dialect/operator/interface/layout_transformation.cc similarity index 54% rename from paddle/cinn/frontend/group_cluster/cluster_policy/shardable_axes_policy/shardable_axes_policy.cc rename to paddle/fluid/pir/dialect/operator/interface/layout_transformation.cc index 36835406267a3..c6c1401f32d5c 100644 --- a/paddle/cinn/frontend/group_cluster/cluster_policy/shardable_axes_policy/shardable_axes_policy.cc +++ b/paddle/fluid/pir/dialect/operator/interface/layout_transformation.cc @@ -12,14 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/cinn/frontend/group_cluster/cluster_policy/shardable_axes_policy/shardable_axes_policy.h" +#include "paddle/fluid/pir/dialect/operator/interface/layout_transformation.h" -namespace cinn::frontend::group_cluster::policy { +namespace paddle { +namespace dialect { -bool ShardableAxesPolicy::CanFuse(const PatternNodePtr upstream, - const PatternNodePtr downstream) { - // TODO(wuzhanfei) shardable axes policy - return false; +template <> +common::DataLayout PreferLayoutImpl(pir::Operation* op) { + return common::DataLayout::NHWC; } -} // namespace cinn::frontend::group_cluster::policy +template <> +void RewriteByLayoutImpl(pir::Operation* op, + common::DataLayout new_layout) { + return; +} + +} // namespace dialect +} // namespace paddle +IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::LayoutTransformationInterface) diff --git a/paddle/fluid/pir/dialect/operator/interface/layout_transformation.h b/paddle/fluid/pir/dialect/operator/interface/layout_transformation.h new file mode 100644 index 0000000000000..71678029fb48c --- /dev/null +++ b/paddle/fluid/pir/dialect/operator/interface/layout_transformation.h @@ -0,0 +1,106 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/pir/dialect/operator/interface/layout_transformation.hpp" + +#include "paddle/common/enforce.h" +#include "paddle/common/layout.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/fluid/pir/dialect/operator/utils/utils.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/pir/include/core/op_base.h" +#include "paddle/pir/include/core/type_name.h" + +namespace paddle { +namespace dialect { + +class LayoutTransformationInterface + : public pir::OpInterfaceBase { + public: + using PreferLayoutFn = common::DataLayout (*)(pir::Operation*); + using RewriteByLayoutFn = void (*)(pir::Operation*, common::DataLayout); + using RelevantInputsFn = std::vector (*)(pir::Operation*); + using RelevantOutputsFn = std::vector (*)(pir::Operation*); + + struct Concept { + explicit Concept(PreferLayoutFn prefer_layout, + RewriteByLayoutFn rewrite_by_layout, + RelevantInputsFn relevant_inputs, + RelevantOutputsFn relevant_outputs) + : prefer_layout(prefer_layout), + rewrite_by_layout(rewrite_by_layout), + relevant_inputs(relevant_inputs), + relevant_outputs(relevant_outputs) {} + + PreferLayoutFn prefer_layout; + RewriteByLayoutFn rewrite_by_layout; + RelevantInputsFn relevant_inputs; + RelevantOutputsFn relevant_outputs; + }; + + template + struct Model : public Concept { + static common::DataLayout PreferLayoutModel(pir::Operation* op) { + return PreferLayoutImpl(op); + } + + static void RewriteByLayoutModel(pir::Operation* op, + common::DataLayout new_layout) { + RewriteByLayoutImpl(op, new_layout); + } + + static std::vector RelevantInputsModel(pir::Operation* op) { + return RelevantInputsImpl(op); + } + + static std::vector RelevantOutputsModel(pir::Operation* op) { + return RelevantOutputsImpl(op); + } + + Model() + : Concept(PreferLayoutModel, + RewriteByLayoutModel, + RelevantInputsModel, + RelevantOutputsModel) {} + }; + + LayoutTransformationInterface(pir::Operation* op, Concept* impl) + : pir::OpInterfaceBase(op), impl_(impl) {} + + common::DataLayout PreferLayout(pir::Operation* op) { + return impl_->prefer_layout(op); + } + + void RewriteByLayout(pir::Operation* op, common::DataLayout new_layout) { + impl_->rewrite_by_layout(op, new_layout); + } + + std::vector RelevantInputs(pir::Operation* op) { + return impl_->relevant_inputs(op); + } + + std::vector RelevantOutputs(pir::Operation* op) { + return impl_->relevant_outputs(op); + } + + private: + Concept* impl_; +}; + +} // namespace dialect +} // namespace paddle + +IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::LayoutTransformationInterface) diff --git a/paddle/fluid/pir/dialect/operator/interface/layout_transformation.hpp b/paddle/fluid/pir/dialect/operator/interface/layout_transformation.hpp new file mode 100644 index 0000000000000..c1860cbbac108 --- /dev/null +++ b/paddle/fluid/pir/dialect/operator/interface/layout_transformation.hpp @@ -0,0 +1,60 @@ +// Copyright (c) 2024 PaddlePaddle Authors. 
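
Editor's note: the new layout interface above uses pir's usual Concept/Model type erasure: a plain struct of function pointers, filled in per concrete op by a templated Model, so no virtual dispatch lives on the op itself. Stripped to a single hook (illustrative; omits the real OpInterfaceBase plumbing):

```cpp
#include <iostream>
#include <string>

// Minimal model of the interface pattern: a Concept of function pointers,
// populated per op type by a templated Model.
struct Concept {
  std::string (*prefer_layout)();
};

template <typename ConcreteOp>
struct Model : Concept {
  static std::string PreferLayoutModel() { return ConcreteOp::preferred; }
  Model() : Concept{&PreferLayoutModel} {}
};

struct Conv2dLikeOp { static constexpr const char* preferred = "NHWC"; };
struct DefaultOp    { static constexpr const char* preferred = "ALL_LAYOUT"; };

int main() {
  Model<Conv2dLikeOp> conv_table;
  Model<DefaultOp> default_table;
  Concept* c = &conv_table;
  std::cout << c->prefer_layout() << "\n";  // NHWC
  c = &default_table;
  std::cout << c->prefer_layout() << "\n";  // ALL_LAYOUT
  return 0;
}
```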
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/common/enforce.h" +#include "paddle/common/layout.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/pir/include/core/operation.h" +#include "paddle/pir/include/core/type_name.h" + +namespace paddle { +namespace dialect { + +template +common::DataLayout PreferLayoutImpl(pir::Operation* op) { + return common::DataLayout::ALL_LAYOUT; +} + +template +void RewriteByLayoutImpl(pir::Operation* op, common::DataLayout new_layout) { + PADDLE_THROW(common::errors::Unimplemented( + "Op %s should have a specialized RewriteByLayout function", + pir::get_type_name())); +} + +template +std::vector RelevantInputsImpl(pir::Operation* op) { + return op->operands_source(); +} + +template +std::vector RelevantOutputsImpl(pir::Operation* op) { + return op->results(); +} + +class FusedConv2dAddActOp; +template <> +common::DataLayout PreferLayoutImpl(pir::Operation*); +extern template common::DataLayout PreferLayoutImpl( + pir::Operation*); +template <> +void RewriteByLayoutImpl(pir::Operation*, + common::DataLayout); +extern template void RewriteByLayoutImpl( + pir::Operation*, common::DataLayout); + +} // namespace dialect +} // namespace paddle diff --git a/paddle/fluid/pir/dialect/operator/ir/api_builder.cc b/paddle/fluid/pir/dialect/operator/ir/api_builder.cc index a86e19ccfe0a6..939f91154de5b 100644 --- a/paddle/fluid/pir/dialect/operator/ir/api_builder.cc +++ b/paddle/fluid/pir/dialect/operator/ir/api_builder.cc @@ -22,11 +22,17 @@ namespace dialect { ApiBuilder::ApiBuilder() : ctx_(pir::IrContext::Instance()), builder_(std::make_shared(ctx_)) { - IR_ENFORCE(builder_ != nullptr, "api builder construct error!"); + PADDLE_ENFORCE_NE( + builder_, + nullptr, + phi::errors::InvalidArgument("api builder construct error!")); } void ApiBuilder::SetProgram(pir::Program* program) { - IR_ENFORCE(program != nullptr, "argument of program is nullptr"); + PADDLE_ENFORCE_NE( + program, + nullptr, + phi::errors::InvalidArgument("argument of program is nullptr")); builder_->SetInsertionPointToBlockEnd(program->block()); } @@ -50,8 +56,10 @@ void ApiBuilder::SetParameter(const std::string& name, } void ApiBuilder::LoadInsertionPoint() { - IR_ENFORCE(!insertion_point_stack_.empty(), - "insertion_point_stack_ is empty."); + PADDLE_ENFORCE_EQ( + !insertion_point_stack_.empty(), + true, + phi::errors::InvalidArgument("insertion_point_stack_ is empty.")); builder_->set_insertion_point(insertion_point_stack_.top()); insertion_point_stack_.pop(); } diff --git a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc index f674c35096018..ef9ecc2bd8ff7 100644 --- a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc +++ b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc @@ -765,14 +765,14 @@ bool WhileOp::InferSymbolicShape( } if (input_arg_shape[j] == yield_value_shape[j]) { // Dim isn't changed in while - 
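
Editor's note: layout_transformation.hpp pairs permissive defaults (an ALL_LAYOUT preference, pass-through relevant inputs/outputs) with explicit per-op specializations such as FusedConv2dAddActOp, while the default RewriteByLayoutImpl throws, so an op cannot claim a preferred layout without also saying how to rewrite to it. The shape of that pattern with toy types (std::runtime_error in place of PADDLE_THROW):

```cpp
#include <iostream>
#include <stdexcept>
#include <string>

struct GenericOp {};
struct FusedConvLikeOp {};

// Permissive default: any layout is fine unless an op says otherwise.
template <typename T>
std::string PreferLayout() { return "ALL_LAYOUT"; }

// Explicit specialization overrides the default for one op.
template <>
std::string PreferLayout<FusedConvLikeOp>() { return "NHWC"; }

// Defaulting to a throw forces every layout-sensitive op to provide a
// real rewrite before it may claim a preferred layout.
template <typename T>
void RewriteByLayout(const std::string&) {
  throw std::runtime_error("op should have a specialized RewriteByLayout");
}

int main() {
  std::cout << PreferLayout<GenericOp>() << "\n";        // ALL_LAYOUT
  std::cout << PreferLayout<FusedConvLikeOp>() << "\n";  // NHWC
  return 0;
}
```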
shape_analysis->DimExprBuilder().CstrEq(original_input_shape[j], - input_arg_shape[j]); + shape_analysis->AddEqualCstr(original_input_shape[j], + input_arg_shape[j]); continue; } if (original_input_shape.size() == yield_value_shape.size() && original_input_shape[j] == yield_value_shape[j]) { - shape_analysis->DimExprBuilder().CstrEq(original_input_shape[j], - input_arg_shape[j]); + shape_analysis->AddEqualCstr(original_input_shape[j], + input_arg_shape[j]); continue; } } @@ -827,16 +827,27 @@ void HasElementsOp::Build(pir::Builder &builder, // NOLINT void HasElementsOp::VerifySig() { VLOG(4) << "Verifying inputs, outputs ,attributes for: HasElementsOp."; // Verify inputs: - IR_ENFORCE(num_operands() == 1u, "The size of inputs must equal to 1."); - IR_ENFORCE(operand_type(0).isa(), - "The first input of cf.has_elements must be container type."); + PADDLE_ENFORCE_EQ( + num_operands(), + 1u, + phi::errors::InvalidArgument("The size of inputs must equal to 1.")); + PADDLE_ENFORCE_EQ( + operand_type(0).isa(), + true, + phi::errors::InvalidArgument( + "The first input of cf.has_elements must be container type.")); // No attributes should be verify. // Verify outputs: - IR_ENFORCE(num_results() == 1u, "The size of outputs must be equal to 1."); - IR_ENFORCE((*this)->result_type(0).isa(), - "The type of cf.has_elements' output is not correct."); + PADDLE_ENFORCE_EQ( + num_results(), + 1u, + phi::errors::InvalidArgument("The size of outputs must be equal to 1.")); + PADDLE_ENFORCE_EQ((*this)->result_type(0).isa(), + true, + phi::errors::InvalidArgument( + "The type of cf.has_elements' output is not correct.")); } const char *AssertOp::attributes_name[1] = {"summarize"}; @@ -886,51 +897,69 @@ void AssertOp::VerifySig() { VLOG(4) << "Verifying inputs:"; { auto input_size = num_operands(); - IR_ENFORCE(input_size == 2u, - "The size %d of inputs must be equal to 2.", - input_size); + PADDLE_ENFORCE_EQ( + input_size, + 2u, + phi::errors::InvalidArgument( + "The size %d of inputs must be equal to 2.", input_size)); if ((*this)->operand_source(0).type().isa()) { - IR_ENFORCE((*this) - ->operand_source(0) - .type() - .dyn_cast() - .dtype() - .isa(), - "Type validation failed for the 0th input, it should be a " - "bool DenseTensorType."); + PADDLE_ENFORCE_EQ( + (*this) + ->operand_source(0) + .type() + .dyn_cast() + .dtype() + .isa(), + true, + phi::errors::InvalidArgument( + "Type validation failed for the 0th input, it should be a " + "bool DenseTensorType.")); } if (auto vec_type = (*this)->operand(1).type().dyn_cast()) { for (size_t i = 0; i < vec_type.size(); ++i) { - IR_ENFORCE(vec_type[i].isa() || - vec_type[i].isa(), - "Type validation failed for the 1th input."); + PADDLE_ENFORCE_EQ( + vec_type[i].isa() || + vec_type[i].isa(), + true, + phi::errors::InvalidArgument( + "Type validation failed for the 1th input.")); } } else { - IR_ENFORCE( + PADDLE_ENFORCE_EQ( (*this)->operand(1).type().isa() || (*this) ->operand(1) .type() .isa(), - "Type validation failed for the 1th input."); + true, + phi::errors::InvalidArgument( + "Type validation failed for the 1th input.")); } } VLOG(4) << "Verifying attributes:"; { auto &attributes = this->attributes(); - IR_ENFORCE(attributes.count("summarize") > 0, "summarize does not exist."); - IR_ENFORCE(attributes.at("summarize").isa(), - "Type of attribute: summarize is not pir::Int64Attribute."); + PADDLE_ENFORCE_GT( + attributes.count("summarize"), + 0, + phi::errors::InvalidArgument("summarize does not exist.")); + PADDLE_ENFORCE_EQ( + 
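
Editor's note: most of the control_flow_op.cc churn here is mechanical: IR_ENFORCE(cond, msg) becomes PADDLE_ENFORCE_EQ/_GT/_NE(a, b, InvalidArgument(...)), which reports both operands rather than a bare boolean. A toy _EQ macro conveys the gain (Paddle's real macros build phi error objects; this one just prints and aborts):

```cpp
#include <cstdio>
#include <cstdlib>

// Toy analogue of PADDLE_ENFORCE_EQ: checks a == b and reports both sides
// on failure, which is what the migration buys over a bare boolean check.
#define TOY_ENFORCE_EQ(a, b, msg)                                        \
  do {                                                                   \
    long long lhs = (a), rhs = (b);                                      \
    if (lhs != rhs) {                                                    \
      std::fprintf(stderr, "%s (got %lld vs %lld)\n", (msg), lhs, rhs);  \
      std::abort();                                                      \
    }                                                                    \
  } while (0)

int main() {
  int num_operands = 1;
  TOY_ENFORCE_EQ(num_operands, 1, "The size of inputs must equal to 1.");
  std::puts("check passed");
  return 0;
}
```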
attributes.at("summarize").isa(), + true, + phi::errors::InvalidArgument( + "Type of attribute: summarize is not pir::Int64Attribute.")); } VLOG(4) << "Verifying outputs:"; { auto output_size = num_results(); - IR_ENFORCE(output_size == 0u, - "The size %d of outputs must be equal to 0.", - output_size); + PADDLE_ENFORCE_EQ( + output_size, + 0u, + phi::errors::InvalidArgument( + "The size %d of outputs must be equal to 0.", output_size)); // Outputs num is 0, not need to check outputs type. } VLOG(4) << "End Verifying for: AssertOp."; @@ -941,74 +970,104 @@ void SelectInputOp::VerifySig() { VLOG(4) << "Verifying inputs:"; { auto in_size = num_operands(); - IR_ENFORCE(in_size == 3u, "Size %d of inputs must be 3.", in_size); + PADDLE_ENFORCE_EQ( + in_size, + 3u, + phi::errors::InvalidArgument("Size %d of inputs must be 3.", in_size)); auto input1 = (*this)->operand_source(1).type(); auto input2 = (*this)->operand_source(2).type(); if (input1.isa() && input2.isa()) { auto tensor1 = input1.dyn_cast(); auto tensor2 = input2.dyn_cast(); - IR_ENFORCE( - tensor1.dtype() == tensor2.dtype(), - "The 1st input dtype %s should be equal to 2ed input dtype %s.", + PADDLE_ENFORCE_EQ( tensor1.dtype(), - tensor2.dtype()); - IR_ENFORCE(tensor1.data_layout() == tensor2.data_layout(), - "The 1st input data_layout %s should be equal to 2ed input " - "data_layout %s.", - tensor1.data_layout(), - tensor2.data_layout()); - IR_ENFORCE(tensor1.lod() == tensor2.lod(), - "The 1st input lod %s should be equal to 2ed input lod %s.", - tensor1.lod(), - tensor2.lod()); - IR_ENFORCE( - tensor1.offset() == tensor2.offset(), - "The 1st input offset %s should be equal to 2ed input offset %s.", + tensor2.dtype(), + phi::errors::InvalidArgument( + "The 1st input dtype %s should be equal to 2ed input dtype %s.", + tensor1.dtype(), + tensor2.dtype())); + PADDLE_ENFORCE_EQ( + tensor1.data_layout(), + tensor2.data_layout(), + phi::errors::InvalidArgument( + "The 1st input data_layout %s should be equal to 2ed input " + "data_layout %s.", + tensor1.data_layout(), + tensor2.data_layout())); + PADDLE_ENFORCE_EQ( + tensor1.lod(), + tensor2.lod(), + phi::errors::InvalidArgument( + "The 1st input lod %s should be equal to 2ed input lod %s.", + tensor1.lod(), + tensor2.lod())); + PADDLE_ENFORCE_EQ( tensor1.offset(), - tensor2.offset()); + tensor2.offset(), + phi::errors::InvalidArgument( + "The 1st input offset %s should be equal to 2ed input offset %s.", + tensor1.offset(), + tensor2.offset())); } else if (input1.isa() && input2.isa()) { auto tensor1 = input1.dyn_cast(); auto tensor2 = input1.dyn_cast(); - IR_ENFORCE( - tensor1.dtype() == tensor2.dtype(), - "The 1st input dtype %s should be equal to 2ed input dtype %s.", + PADDLE_ENFORCE_EQ( tensor1.dtype(), - tensor2.dtype()); - IR_ENFORCE(tensor1.data_layout() == tensor2.data_layout(), - "The 1st input data_layout %s should be equal to 2ed input " - "data_layout %s.", - tensor1.data_layout(), - tensor2.data_layout()); - IR_ENFORCE(tensor1.lod() == tensor2.lod(), - "The 1st input lod %s should be equal to 2ed input lod %s.", - tensor1.lod(), - tensor2.lod()); - IR_ENFORCE( - tensor1.offset() == tensor2.offset(), - "The 1st input offset %s should be equal to 2ed input offset %s.", + tensor2.dtype(), + phi::errors::InvalidArgument( + "The 1st input dtype %s should be equal to 2ed input dtype %s.", + tensor1.dtype(), + tensor2.dtype())); + PADDLE_ENFORCE_EQ( + tensor1.data_layout(), + tensor2.data_layout(), + phi::errors::InvalidArgument( + "The 1st input data_layout %s should be equal to 
2ed input " + "data_layout %s.", + tensor1.data_layout(), + tensor2.data_layout())); + PADDLE_ENFORCE_EQ( + tensor1.lod(), + tensor2.lod(), + phi::errors::InvalidArgument( + "The 1st input lod %s should be equal to 2ed input lod %s.", + tensor1.lod(), + tensor2.lod())); + PADDLE_ENFORCE_EQ( tensor1.offset(), - tensor2.offset()); - IR_ENFORCE( - tensor1.place() == tensor2.place(), - "The 1st input place %s should be equal to 2ed input place %s.", + tensor2.offset(), + phi::errors::InvalidArgument( + "The 1st input offset %s should be equal to 2ed input offset %s.", + tensor1.offset(), + tensor2.offset())); + PADDLE_ENFORCE_EQ( tensor1.place(), - tensor2.place()); + tensor2.place(), + phi::errors::InvalidArgument( + "The 1st input place %s should be equal to 2ed input place %s.", + tensor1.place(), + tensor2.place())); } else { - IR_ENFORCE(input1 == input2, - "The 1st input type %s should be equal to 2ed input type %s.", - input1, - input2); + PADDLE_ENFORCE_EQ( + input1, + input2, + phi::errors::InvalidArgument( + "The 1st input type %s should be equal to 2ed input type %s.", + input1, + input2)); } } VLOG(4) << "Verifying outputs:"; { auto out_size = num_results(); - IR_ENFORCE( - out_size == 1u, "Size %d of outputs must be equal to 1.", out_size); + PADDLE_ENFORCE_EQ(out_size, + 1u, + phi::errors::InvalidArgument( + "Size %d of outputs must be equal to 1.", out_size)); } VLOG(4) << "End Verifying for: AssignArray_Op."; } @@ -1061,13 +1120,18 @@ void SelectOutputOp::VerifySig() { VLOG(4) << "Verifying inputs:"; { auto in_size = num_operands(); - IR_ENFORCE(in_size == 2u, "Size %d of inputs must be 2.", in_size); + PADDLE_ENFORCE_EQ( + in_size, + 2u, + phi::errors::InvalidArgument("Size %d of inputs must be 2.", in_size)); } VLOG(4) << "Verifying outputs:"; { auto out_size = num_results(); - IR_ENFORCE( - out_size == 2u, "Size %d of outputs must be equal to 2.", out_size); + PADDLE_ENFORCE_EQ(out_size, + 2u, + phi::errors::InvalidArgument( + "Size %d of outputs must be equal to 2.", out_size)); auto out1 = (*this)->result(0).type(); auto out2 = (*this)->result(1).type(); @@ -1075,58 +1139,83 @@ void SelectOutputOp::VerifySig() { out2.isa()) { auto tensor1 = out1.dyn_cast(); auto tensor2 = out2.dyn_cast(); - IR_ENFORCE( - tensor1.dtype() == tensor2.dtype(), - "The 1st input dtype %s should be equal to 2ed input dtype %s.", + PADDLE_ENFORCE_EQ( tensor1.dtype(), - tensor2.dtype()); - IR_ENFORCE(tensor1.data_layout() == tensor2.data_layout(), - "The 1st input data_layout %s should be equal to 2ed input " - "data_layout %s.", - tensor1.data_layout(), - tensor2.data_layout()); - IR_ENFORCE(tensor1.lod() == tensor2.lod(), - "The 1st input lod %s should be equal to 2ed input lod %s.", - tensor1.lod(), - tensor2.lod()); - IR_ENFORCE( - tensor1.offset() == tensor2.offset(), - "The 1st input offset %s should be equal to 2ed input offset %s.", + tensor2.dtype(), + phi::errors::InvalidArgument( + "The 1st input dtype %s should be equal to 2ed input dtype %s.", + tensor1.dtype(), + tensor2.dtype())); + PADDLE_ENFORCE_EQ( + tensor1.data_layout(), + tensor2.data_layout(), + phi::errors::InvalidArgument( + "The 1st input data_layout %s should be equal to 2ed input " + "data_layout %s.", + tensor1.data_layout(), + tensor2.data_layout())); + PADDLE_ENFORCE_EQ( + tensor1.lod(), + tensor2.lod(), + phi::errors::InvalidArgument( + "The 1st input lod %s should be equal to 2ed input lod %s.", + tensor1.lod(), + tensor2.lod())); + PADDLE_ENFORCE_EQ( tensor1.offset(), - tensor2.offset()); + tensor2.offset(), + 
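
Editor's note: select_input/select_output verification must confirm that the two candidate values agree on every piece of tensor metadata — dtype, data layout, lod, offset, and additionally place for allocated types — hence the field-by-field comparisons in the hunks above. A compact stand-in using std::tie keeps such a check exhaustive and hard to desynchronize:

```cpp
#include <cassert>
#include <cstdint>
#include <iostream>
#include <string>
#include <tuple>

// Illustrative stand-in for DenseTensorType metadata; the real type also
// carries lod and, for allocated tensors, a place.
struct TensorMeta {
  std::string dtype;
  std::string layout;  // e.g. "NCHW"
  int64_t offset = 0;

  bool SameAs(const TensorMeta& other) const {
    return std::tie(dtype, layout, offset) ==
           std::tie(other.dtype, other.layout, other.offset);
  }
};

int main() {
  TensorMeta a{"float32", "NCHW", 0};
  TensorMeta b{"float32", "NCHW", 0};
  assert(a.SameAs(b));
  std::cout << "branch inputs are type-compatible\n";
  return 0;
}
```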
phi::errors::InvalidArgument( + "The 1st input offset %s should be equal to 2ed input offset %s.", + tensor1.offset(), + tensor2.offset())); } else if (out1.isa() && out2.isa()) { auto tensor1 = out1.dyn_cast(); auto tensor2 = out2.dyn_cast(); - IR_ENFORCE( - tensor1.dtype() == tensor2.dtype(), - "The 1st input dtype %s should be equal to 2ed input dtype %s.", + PADDLE_ENFORCE_EQ( tensor1.dtype(), - tensor2.dtype()); - IR_ENFORCE(tensor1.data_layout() == tensor2.data_layout(), - "The 1st input data_layout %s should be equal to 2ed input " - "data_layout %s.", - tensor1.data_layout(), - tensor2.data_layout()); - IR_ENFORCE(tensor1.lod() == tensor2.lod(), - "The 1st input lod %s should be equal to 2ed input lod %s.", - tensor1.lod(), - tensor2.lod()); - IR_ENFORCE( - tensor1.offset() == tensor2.offset(), - "The 1st input offset %s should be equal to 2ed input offset %s.", + tensor2.dtype(), + phi::errors::InvalidArgument( + "The 1st input dtype %s should be equal to 2ed input dtype %s.", + tensor1.dtype(), + tensor2.dtype())); + PADDLE_ENFORCE_EQ( + tensor1.data_layout(), + tensor2.data_layout(), + phi::errors::InvalidArgument( + "The 1st input data_layout %s should be equal to 2ed input " + "data_layout %s.", + tensor1.data_layout(), + tensor2.data_layout())); + PADDLE_ENFORCE_EQ( + tensor1.lod(), + tensor2.lod(), + phi::errors::InvalidArgument( + "The 1st input lod %s should be equal to 2ed input lod %s.", + tensor1.lod(), + tensor2.lod())); + PADDLE_ENFORCE_EQ( tensor1.offset(), - tensor2.offset()); - IR_ENFORCE( - tensor1.place() == tensor2.place(), - "The 1st input place %s should be equal to 2ed input place %s.", + tensor2.offset(), + phi::errors::InvalidArgument( + "The 1st input offset %s should be equal to 2ed input offset %s.", + tensor1.offset(), + tensor2.offset())); + PADDLE_ENFORCE_EQ( tensor1.place(), - tensor2.place()); + tensor2.place(), + phi::errors::InvalidArgument( + "The 1st input place %s should be equal to 2ed input place %s.", + tensor1.place(), + tensor2.place())); } else { - IR_ENFORCE(out1 == out2, - "The 1st input type %s should be equal to 2ed input type %s.", - out1, - out2); + PADDLE_ENFORCE_EQ( + out1, + out2, + phi::errors::InvalidArgument( + "The 1st input type %s should be equal to 2ed input type %s.", + out1, + out2)); } } VLOG(4) << "End Verifying for: AssignArray_Op."; diff --git a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.h b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.h index 8b5af449d4820..9b9bcd97b78fe 100644 --- a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.h +++ b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.h @@ -173,7 +173,8 @@ class HasElementsOp : public pir::Op { /// print(summarize number of elements in data) /// } /// -class AssertOp : public pir::Op { +class AssertOp + : public pir::Op { public: using Op::Op; static const char *name() { return "pd_op.assert"; } diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_onednn_op.cc b/paddle/fluid/pir/dialect/operator/ir/manual_onednn_op.cc index 4e4b7f46b382c..17d9a1dadc903 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_onednn_op.cc +++ b/paddle/fluid/pir/dialect/operator/ir/manual_onednn_op.cc @@ -124,16 +124,21 @@ void ExpandOp::Build(pir::Builder& builder, pir::AttributeMap attributes) { VLOG(4) << "Start build ExpandOp"; - IR_ENFORCE(attributes.find("shape") != attributes.end(), - "'shape' Attribute is expected for ExpandOp. 
"); + PADDLE_ENFORCE_NE(attributes.find("shape"), + attributes.end(), + phi::errors::InvalidArgument( + "'shape' Attribute is expected for ExpandOp. ")); std::vector shape = attributes.at("shape") .dyn_cast() .data() .GetData(); - IR_ENFORCE(attributes.find("mkldnn_data_type") != attributes.end(), - "'mkldnn_data_type' Attribute is expected for ExpandOp. "); + PADDLE_ENFORCE_NE( + attributes.find("mkldnn_data_type"), + attributes.end(), + phi::errors::InvalidArgument( + "'mkldnn_data_type' Attribute is expected for ExpandOp. ")); std::string mkldnn_data_type = attributes.at("mkldnn_data_type") .dyn_cast() .AsString(); @@ -190,48 +195,66 @@ void ExpandOp::VerifySig() { VLOG(4) << "Verifying inputs:"; { auto input_size = num_operands(); - IR_ENFORCE(input_size == 2u, - "The size %d of inputs must be equal to 2.", - input_size); - IR_ENFORCE((*this) - ->operand_source(0) - .type() - .isa(), - "Type validation failed for the 0th input, got %s.", - (*this)->operand_source(0).type()); + PADDLE_ENFORCE_EQ( + input_size, + 2u, + phi::errors::InvalidArgument( + "The size %d of inputs must be equal to 2.", input_size)); + PADDLE_ENFORCE_EQ((*this) + ->operand_source(0) + .type() + .isa(), + true, + phi::errors::InvalidArgument( + "Type validation failed for the 0th input, got %s.", + (*this)->operand_source(0).type())); if (auto vec_type = (*this)->operand_source(1).type().dyn_cast()) { for (size_t i = 0; i < vec_type.size(); ++i) { - IR_ENFORCE(vec_type[i].isa(), - "Type validation failed for the 1th input, got %s.", - (*this)->operand_source(1).type()); + PADDLE_ENFORCE_EQ( + vec_type[i].isa(), + true, + phi::errors::InvalidArgument( + "Type validation failed for the 1th input, got %s.", + (*this)->operand_source(1).type())); } } else { - IR_ENFORCE((*this) - ->operand_source(1) - .type() - .isa(), - "Type validation failed for the 1th input, got %s.", - (*this)->operand_source(1).type()); + PADDLE_ENFORCE_EQ((*this) + ->operand_source(1) + .type() + .isa(), + true, + phi::errors::InvalidArgument( + "Type validation failed for the 1th input, got %s.", + (*this)->operand_source(1).type())); } } VLOG(4) << "Verifying attributes:"; { auto& attributes = this->attributes(); - IR_ENFORCE(attributes.count("mkldnn_data_type") > 0, - "mkldnn_data_type does not exist."); - IR_ENFORCE(attributes.at("mkldnn_data_type").isa(), - "Type of attribute: mkldnn_data_type is not pir::StrAttribute."); + PADDLE_ENFORCE_GT( + attributes.count("mkldnn_data_type"), + 0, + phi::errors::InvalidArgument("mkldnn_data_type does not exist.")); + PADDLE_ENFORCE_EQ( + attributes.at("mkldnn_data_type").isa(), + true, + phi::errors::InvalidArgument( + "Type of attribute: mkldnn_data_type is not pir::StrAttribute.")); } VLOG(4) << "Verifying outputs:"; { auto output_size = num_results(); - IR_ENFORCE(output_size == 1u, - "The size %d of outputs must be equal to 1.", - output_size); - IR_ENFORCE( + PADDLE_ENFORCE_EQ( + output_size, + 1u, + phi::errors::InvalidArgument( + "The size %d of outputs must be equal to 1.", output_size)); + PADDLE_ENFORCE_EQ( (*this)->result(0).type().isa(), - "Type validation failed for the 0th output."); + true, + phi::errors::InvalidArgument( + "Type validation failed for the 0th output.")); } VLOG(4) << "End Verifying for: ExpandOp."; } @@ -248,9 +271,11 @@ std::vector ExpandOp::InferMeta( p_attributes, common::errors::Fatal( "AttrtibueMap pointer in InferMeta function is nullptr.")); - IR_ENFORCE(input_values.size() == 2, - "Num of inputs is expected to be 2 but got %d.", - input_values.size()); + 
PADDLE_ENFORCE_EQ(input_values.size(), + 2, + phi::errors::InvalidArgument( + "Num of inputs is expected to be 2 but got %d.", + input_values.size())); pir::Value x_ = input_values[0]; pir::Value shape_ = input_values[1]; diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc index c5dc4457b737e..640cfc6456f1d 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc +++ b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc @@ -148,9 +148,11 @@ std::vector AddNOp::InferMeta( const std::vector &input_values, pir::AttributeMap *p_attributes) { VLOG(4) << "Start infermeta AddNOp"; - IR_ENFORCE(input_values.size() == 1, - "Num of inputs is expected to be 1 but got %d.", - input_values.size()); + PADDLE_ENFORCE_EQ(input_values.size(), + 1, + phi::errors::InvalidArgument( + "Num of inputs is expected to be 1 but got %d.", + input_values.size())); pir::Value inputs_ = input_values[0]; VLOG(4) << "Builder construction outputs"; @@ -294,9 +296,11 @@ std::vector AddN_Op::InferMeta( const std::vector &input_values, pir::AttributeMap *p_attributes) { VLOG(4) << "Start infermeta AddN_Op"; - IR_ENFORCE(input_values.size() == 1, - "Num of inputs is expected to be 1 but got %d.", - input_values.size()); + PADDLE_ENFORCE_EQ(input_values.size(), + 1, + phi::errors::InvalidArgument( + "Num of inputs is expected to be 1 but got %d.", + input_values.size())); pir::Value inputs_ = input_values[0]; VLOG(4) << "Builder construction outputs"; @@ -444,9 +448,11 @@ std::vector AddNArrayOp::InferMeta( const std::vector &input_values, pir::AttributeMap *p_attributes) { VLOG(4) << "Start infermeta AddNArrayOp"; - IR_ENFORCE(input_values.size() == 1, - "Num of inputs is expected to be 1 but got %d.", - input_values.size()); + PADDLE_ENFORCE_EQ(input_values.size(), + 1, + phi::errors::InvalidArgument( + "Num of inputs is expected to be 1 but got %d.", + input_values.size())); pir::Value inputs_ = input_values[0]; VLOG(4) << "Builder construction outputs"; pir::VectorType inputs = inputs_.type().dyn_cast(); @@ -670,9 +676,11 @@ std::vector FusedGemmEpilogueOp::InferMeta( "AttrtibueMap pointer in InferMeta function is nullptr.")); auto &attributes = *p_attributes; VLOG(4) << "Start infermeta FusedGemmEpilogueOp"; - IR_ENFORCE(input_values.size() == 3, - "Num of inputs is expected to be 3 but got %d.", - input_values.size()); + PADDLE_ENFORCE_EQ(input_values.size(), + 3, + phi::errors::InvalidArgument( + "Num of inputs is expected to be 3 but got %d.", + input_values.size())); pir::Value x_ = input_values[0]; pir::Value y_ = input_values[1]; pir::Value bias_ = input_values[2]; @@ -919,9 +927,11 @@ std::vector FusedGemmEpilogueGradOp::InferMeta( common::errors::Fatal( "AttrtibueMap pointer in InferMeta function is nullptr.")); auto &attributes = *p_attributes; - IR_ENFORCE(input_values.size() == 4, - "Num of inputs is expected to be 4 but got %d.", - input_values.size()); + PADDLE_ENFORCE_EQ(input_values.size(), + 4, + phi::errors::InvalidArgument( + "Num of inputs is expected to be 4 but got %d.", + input_values.size())); pir::Value x_ = input_values[0]; pir::Value y_ = input_values[1]; @@ -1218,9 +1228,11 @@ std::vector SplitGradOp::InferMeta( pir::AttributeMap *p_attributes) { VLOG(4) << "Start infermeta SplitGradOp"; - IR_ENFORCE(input_values.size() == 2, - "Num of inputs is expected to be 2 but got %d.", - input_values.size()); + PADDLE_ENFORCE_EQ(input_values.size(), + 2, + phi::errors::InvalidArgument( + "Num of inputs is expected to be 2 but got %d.", + 
input_values.size())); pir::Value out_grad_ = input_values[0]; pir::Value axis_ = input_values[1]; @@ -1479,9 +1491,11 @@ std::vector CreateArrayLikeOp::InferMeta( const std::vector &input_values, pir::AttributeMap *p_attributes) { VLOG(4) << "Start infermeta CreateArrayLikeOp"; - IR_ENFORCE(input_values.size() == 1, - "Num of inputs is expected to be 1 but got %d.", - input_values.size()); + PADDLE_ENFORCE_EQ(input_values.size(), + 1, + phi::errors::InvalidArgument( + "Num of inputs is expected to be 1 but got %d.", + input_values.size())); pir::Value input_ = input_values[0]; VLOG(4) << "Builder construction outputs"; @@ -1600,9 +1614,11 @@ std::vector ArrayLengthOp::InferMeta( const std::vector &input_values, pir::AttributeMap *p_attributes) { VLOG(4) << "Start infermeta ArrayLengthOp"; - IR_ENFORCE(input_values.size() == 1, - "Num of inputs is expected to be 1 but got %d.", - input_values.size()); + PADDLE_ENFORCE_EQ(input_values.size(), + 1, + phi::errors::InvalidArgument( + "Num of inputs is expected to be 1 but got %d.", + input_values.size())); pir::Value x_ = input_values[0]; paddle::dialect::DenseTensorArrayType x_type; @@ -1756,9 +1772,11 @@ std::vector ArrayReadOp::InferMeta( const std::vector &input_values, pir::AttributeMap *p_attributes) { VLOG(4) << "Start infermeta ArrayLengthOp"; - IR_ENFORCE(input_values.size() == 2, - "Num of inputs is expected to be 2 but got %d.", - input_values.size()); + PADDLE_ENFORCE_EQ(input_values.size(), + 2, + phi::errors::InvalidArgument( + "Num of inputs is expected to be 2 but got %d.", + input_values.size())); pir::Value array_ = input_values[0]; pir::Value i_ = input_values[1]; @@ -1924,9 +1942,11 @@ std::vector ArrayWrite_Op::InferMeta( const std::vector &input_values, pir::AttributeMap *p_attributes) { VLOG(4) << "Start infermeta ArrayWrite_Op"; - IR_ENFORCE(input_values.size() == 3, - "Num of inputs is expected to be 3 but got %d.", - input_values.size()); + PADDLE_ENFORCE_EQ(input_values.size(), + 3, + phi::errors::InvalidArgument( + "Num of inputs is expected to be 3 but got %d.", + input_values.size())); pir::Value array_ = input_values[0]; pir::Value x_ = input_values[1]; @@ -2121,17 +2141,23 @@ std::vector ArrayToTensorOp::InferMeta( "AttrtibueMap pointer in InferMeta function is nullptr.")); auto &attributes = *p_attributes; VLOG(4) << "Start infermeta ArrayToTensorOp"; - IR_ENFORCE(input_values.size() == 1, - "Num of inputs is expected to be 1 but got %d.", - input_values.size()); + PADDLE_ENFORCE_EQ(input_values.size(), + 1, + phi::errors::InvalidArgument( + "Num of inputs is expected to be 1 but got %d.", + input_values.size())); pir::Value x_ = input_values[0]; - IR_ENFORCE(attributes.find("axis") != attributes.end(), - "'value' Attribute is expected for IncrementOp. "); + PADDLE_ENFORCE_NE(attributes.find("axis"), + attributes.end(), + phi::errors::InvalidArgument( + "'value' Attribute is expected for IncrementOp. ")); int32_t axis = attributes.at("axis").dyn_cast().data(); - IR_ENFORCE(attributes.find("use_stack") != attributes.end(), - "'value' Attribute is expected for IncrementOp. "); + PADDLE_ENFORCE_NE(attributes.find("use_stack"), + attributes.end(), + phi::errors::InvalidArgument( + "'value' Attribute is expected for IncrementOp. 
")); bool use_stack = attributes.at("use_stack").dyn_cast().data(); @@ -2315,21 +2341,27 @@ std::vector TensorToArrayOp::InferMeta( "AttrtibueMap pointer in InferMeta function is nullptr.")); auto &attributes = *p_attributes; VLOG(4) << "Start infermeta TensorToArrayOp"; - IR_ENFORCE(input_values.size() == 2, - "Num of inputs is expected to be 2 but got %d.", - input_values.size()); + PADDLE_ENFORCE_EQ(input_values.size(), + 2, + phi::errors::InvalidArgument( + "Num of inputs is expected to be 2 but got %d.", + input_values.size())); pir::Value x_ = input_values[0]; pir::Value out_grad_ = input_values[1]; VLOG(4) << "Builder construction attributes"; pir::AttributeMap argument_attributes = {}; - IR_ENFORCE(attributes.find("axis") != attributes.end(), - "'value' Attribute is expected for IncrementOp. "); + PADDLE_ENFORCE_NE(attributes.find("axis"), + attributes.end(), + phi::errors::InvalidArgument( + "'value' Attribute is expected for IncrementOp. ")); int32_t axis = attributes.at("axis").dyn_cast().data(); - IR_ENFORCE(attributes.find("use_stack") != attributes.end(), - "'value' Attribute is expected for IncrementOp. "); + PADDLE_ENFORCE_NE(attributes.find("use_stack"), + attributes.end(), + phi::errors::InvalidArgument( + "'value' Attribute is expected for IncrementOp. ")); bool use_stack = attributes.at("use_stack").dyn_cast().data(); @@ -2430,39 +2462,53 @@ void SliceArrayOp::VerifySig() { VLOG(4) << "Verifying inputs:"; { auto input_size = num_operands(); - IR_ENFORCE(input_size == 3u, - "The size %d of inputs must be equal to 3.", - input_size); - IR_ENFORCE((*this) - ->operand_source(0) - .type() - .isa(), - "Type validation failed for the 0th input, got %s.", - (*this)->operand_source(0).type()); - IR_ENFORCE((*this)->operand_source(1).type().isa() || - (*this) - ->operand_source(1) - .type() - .isa(), - "Type validation failed for the 1st input, got %s.", - (*this)->operand_source(1).type()); - IR_ENFORCE((*this)->operand_source(2).type().isa() || - (*this) - ->operand_source(2) - .type() - .isa(), - "Type validation failed for the 1st input, got %s.", - (*this)->operand_source(2).type()); + PADDLE_ENFORCE_EQ( + input_size, + 3u, + phi::errors::InvalidArgument( + "The size %d of inputs must be equal to 3.", input_size)); + PADDLE_ENFORCE_EQ((*this) + ->operand_source(0) + .type() + .isa(), + true, + phi::errors::InvalidArgument( + "Type validation failed for the 0th input, got %s.", + (*this)->operand_source(0).type())); + PADDLE_ENFORCE_EQ( + (*this)->operand_source(1).type().isa() || + (*this) + ->operand_source(1) + .type() + .isa(), + true, + phi::errors::InvalidArgument( + "Type validation failed for the 1st input, got %s.", + (*this)->operand_source(1).type())); + PADDLE_ENFORCE_EQ( + (*this)->operand_source(2).type().isa() || + (*this) + ->operand_source(2) + .type() + .isa(), + true, + phi::errors::InvalidArgument( + "Type validation failed for the 1st input, got %s.", + (*this)->operand_source(2).type())); } VLOG(4) << "Verifying outputs:"; { auto output_size = num_results(); - IR_ENFORCE(output_size == 1u, - "The size %d of outputs must be equal to 1.", - output_size); - IR_ENFORCE( + PADDLE_ENFORCE_EQ( + output_size, + 1u, + phi::errors::InvalidArgument( + "The size %d of outputs must be equal to 1.", output_size)); + PADDLE_ENFORCE_EQ( (*this)->result(0).type().isa(), - "Type validation failed for the 0th output."); + true, + phi::errors::InvalidArgument( + "Type validation failed for the 0th output.")); } VLOG(4) << "End Verifying for: SliceArrayOp."; } @@ -2527,9 
+2573,11 @@ std::vector SliceArrayOp::InferMeta( const std::vector &input_values, pir::AttributeMap *p_attributes) { VLOG(4) << "Start infermeta SliceArrayOp"; - IR_ENFORCE(input_values.size() == 3, - "Num of inputs is expected to be 3 but got %d.", - input_values.size()); + PADDLE_ENFORCE_EQ(input_values.size(), + 3, + phi::errors::InvalidArgument( + "Num of inputs is expected to be 3 but got %d.", + input_values.size())); pir::Value input = input_values[0]; pir::Value starts = input_values[1]; pir::Value ends = input_values[2]; @@ -2622,32 +2670,43 @@ void SliceArrayDenseOp::VerifySig() { VLOG(4) << "Verifying inputs:"; { auto input_size = num_operands(); - IR_ENFORCE(input_size == 2u, - "The size %d of inputs must be equal to 2.", - input_size); - IR_ENFORCE((*this) - ->operand_source(0) - .type() - .isa(), - "Type validation failed for the 0th input, got %s.", - (*this)->operand_source(0).type()); - IR_ENFORCE((*this)->operand_source(1).type().isa() || - (*this) - ->operand_source(1) - .type() - .isa(), - "Type validation failed for the 1st input, got %s.", - (*this)->operand_source(1).type()); + PADDLE_ENFORCE_EQ( + input_size, + 2u, + phi::errors::InvalidArgument( + "The size %d of inputs must be equal to 2.", input_size)); + PADDLE_ENFORCE_EQ((*this) + ->operand_source(0) + .type() + .isa(), + true, + phi::errors::InvalidArgument( + "Type validation failed for the 0th input, got %s.", + (*this)->operand_source(0).type())); + PADDLE_ENFORCE_EQ( + (*this)->operand_source(1).type().isa() || + (*this) + ->operand_source(1) + .type() + .isa(), + true, + phi::errors::InvalidArgument( + "Type validation failed for the 1st input, got %s.", + (*this)->operand_source(1).type())); } VLOG(4) << "Verifying outputs:"; { auto output_size = num_results(); - IR_ENFORCE(output_size == 1u, - "The size %d of outputs must be equal to 1.", - output_size); - IR_ENFORCE( + PADDLE_ENFORCE_EQ( + output_size, + 1u, + phi::errors::InvalidArgument( + "The size %d of outputs must be equal to 1.", output_size)); + PADDLE_ENFORCE_EQ( (*this)->result(0).type().isa(), - "Type validation failed for the 0th output."); + true, + phi::errors::InvalidArgument( + "Type validation failed for the 0th output.")); } VLOG(4) << "End Verifying for: SliceArrayOp."; } @@ -2678,9 +2737,11 @@ std::vector SliceArrayDenseOp::InferMeta( const std::vector &input_values, pir::AttributeMap *p_attributes) { VLOG(4) << "Start infermeta SliceArrayDenseOp"; - IR_ENFORCE(input_values.size() == 2, - "Num of inputs is expected to be 2 but got %d.", - input_values.size()); + PADDLE_ENFORCE_EQ(input_values.size(), + 2, + phi::errors::InvalidArgument( + "Num of inputs is expected to be 2 but got %d.", + input_values.size())); pir::Value input = input_values[0]; pir::Value starts = input_values[1]; @@ -2772,15 +2833,19 @@ void AssignArrayOp::VerifySig() { VLOG(4) << "Verifying inputs:"; { auto input_size = num_operands(); - IR_ENFORCE(input_size == 1u, - "The size %d of inputs must be equal to 1.", - input_size); - IR_ENFORCE((*this) - ->operand_source(0) - .type() - .isa(), - "Type validation failed for the 0th input, got %s.", - (*this)->operand_source(0).type()); + PADDLE_ENFORCE_EQ( + input_size, + 1u, + phi::errors::InvalidArgument( + "The size %d of inputs must be equal to 1.", input_size)); + PADDLE_ENFORCE_EQ((*this) + ->operand_source(0) + .type() + .isa(), + true, + phi::errors::InvalidArgument( + "Type validation failed for the 0th input, got %s.", + (*this)->operand_source(0).type())); } VLOG(4) << "Verifying attributes:"; { @@ -2789,12 
+2854,16 @@ void AssignArrayOp::VerifySig() { VLOG(4) << "Verifying outputs:"; { auto output_size = num_results(); - IR_ENFORCE(output_size == 1u, - "The size %d of outputs must be equal to 1.", - output_size); - IR_ENFORCE( + PADDLE_ENFORCE_EQ( + output_size, + 1u, + phi::errors::InvalidArgument( + "The size %d of outputs must be equal to 1.", output_size)); + PADDLE_ENFORCE_EQ( (*this)->result(0).type().isa(), - "Type validation failed for the 0th output."); + true, + phi::errors::InvalidArgument( + "Type validation failed for the 0th output.")); } VLOG(4) << "End Verifying for: AssignArrayOp."; } @@ -2817,9 +2886,11 @@ std::vector AssignArrayOp::InferMeta( const std::vector &input_values, pir::AttributeMap *p_attributes) { VLOG(4) << "Start infermeta AssignArrayOp"; - IR_ENFORCE(input_values.size() == 1, - "Num of inputs is expected to be 1 but got %d.", - input_values.size()); + PADDLE_ENFORCE_EQ(input_values.size(), + 1, + phi::errors::InvalidArgument( + "Num of inputs is expected to be 1 but got %d.", + input_values.size())); pir::Value x_ = input_values[0]; VLOG(4) << "Builder construction outputs"; @@ -2885,26 +2956,35 @@ void AssignArray_Op::VerifySig() { VLOG(4) << "Verifying inputs:"; { auto input_size = num_operands(); - IR_ENFORCE(input_size == 1u, - "The size %d of inputs must be equal to 1.", - input_size); - IR_ENFORCE((*this) - ->operand_source(0) - .type() - .isa(), - "Type validation failed for the 0th input, but got %s.", - (*this)->operand_source(0).type()); + PADDLE_ENFORCE_EQ( + input_size, + 1u, + phi::errors::InvalidArgument( + "The size %d of inputs must be equal to 1.", input_size)); + PADDLE_ENFORCE_EQ( + (*this) + ->operand_source(0) + .type() + .isa(), + true, + phi::errors::InvalidArgument( + "Type validation failed for the 0th input, but got %s.", + (*this)->operand_source(0).type())); } VLOG(4) << "Verifying attributes:"; VLOG(4) << "Verifying outputs:"; { auto output_size = num_results(); - IR_ENFORCE(output_size == 1u, - "The size %d of outputs must be equal to 1.", - output_size); - IR_ENFORCE( + PADDLE_ENFORCE_EQ( + output_size, + 1u, + phi::errors::InvalidArgument( + "The size %d of outputs must be equal to 1.", output_size)); + PADDLE_ENFORCE_EQ( (*this)->result(0).type().isa(), - "Type validation failed for the 0th output."); + true, + phi::errors::InvalidArgument( + "Type validation failed for the 0th output.")); } VLOG(4) << "End Verifying for: AssignArray_Op."; } @@ -2918,9 +2998,11 @@ std::vector AssignArray_Op::InferMeta( const std::vector &input_values, pir::AttributeMap *p_attributes) { VLOG(4) << "Start infermeta AssignArray_Op"; - IR_ENFORCE(input_values.size() == 1, - "Num of inputs is expected to be 1 but got %d.", - input_values.size()); + PADDLE_ENFORCE_EQ(input_values.size(), + 1, + phi::errors::InvalidArgument( + "Num of inputs is expected to be 1 but got %d.", + input_values.size())); pir::Value x_ = input_values[0]; VLOG(4) << "Builder construction outputs"; @@ -3022,8 +3104,10 @@ void ExpandOp::Build(pir::Builder &builder, pir::AttributeMap attributes) { VLOG(4) << "Start build ExpandOp"; - IR_ENFORCE(attributes.find("shape") != attributes.end(), - "'shape' Attribute is expected for ExpandOp. "); + PADDLE_ENFORCE_NE(attributes.find("shape"), + attributes.end(), + phi::errors::InvalidArgument( + "'shape' Attribute is expected for ExpandOp. 
")); std::vector shape = attributes.at("shape") .dyn_cast() @@ -3114,10 +3198,13 @@ bool ExpandOp::InferSymbolicShape( if (expand_shape[i] == -1) { // copy the dim from x // the shape is right aligned int index = i - (expand_shape.size() - x_dims.size()); - IR_ENFORCE(index >= 0, - "in ExpandOpInferSymbolicShape, the dim to copy must >= 0, " - "but got %d", - index); + PADDLE_ENFORCE_GE( + index, + 0, + phi::errors::InvalidArgument( + "in ExpandOpInferSymbolicShape, the dim to copy must >= 0, " + "but got %d", + index)); out_shape[i] = x_dims[index]; } @@ -3136,26 +3223,34 @@ void ExpandOp::VerifySig() { VLOG(4) << "Verifying inputs:"; { auto input_size = num_operands(); - IR_ENFORCE(input_size == 2u, - "The size %d of inputs must be equal to 2.", - input_size); - IR_ENFORCE((*this) - ->operand_source(0) - .type() - .isa(), - "Type validation failed for the 0th input."); + PADDLE_ENFORCE_EQ( + input_size, + 2u, + phi::errors::InvalidArgument( + "The size %d of inputs must be equal to 2.", input_size)); + PADDLE_ENFORCE_EQ((*this) + ->operand_source(0) + .type() + .isa(), + true, + phi::errors::InvalidArgument( + "Type validation failed for the 0th input.")); if (auto vec_type = (*this)->operand_source(1).type().dyn_cast()) { for (size_t i = 0; i < vec_type.size(); ++i) { - IR_ENFORCE(vec_type[i].isa(), - "Type validation failed for the 1th input."); + PADDLE_ENFORCE_EQ(vec_type[i].isa(), + true, + phi::errors::InvalidArgument( + "Type validation failed for the 1th input.")); } } else { - IR_ENFORCE((*this) - ->operand_source(1) - .type() - .isa(), - "Type validation failed for the 1th input."); + PADDLE_ENFORCE_EQ((*this) + ->operand_source(1) + .type() + .isa(), + true, + phi::errors::InvalidArgument( + "Type validation failed for the 1th input.")); } } VLOG(4) << "Verifying attributes:"; @@ -3165,12 +3260,16 @@ void ExpandOp::VerifySig() { VLOG(4) << "Verifying outputs:"; { auto output_size = num_results(); - IR_ENFORCE(output_size == 1u, - "The size %d of outputs must be equal to 1.", - output_size); - IR_ENFORCE( + PADDLE_ENFORCE_EQ( + output_size, + 1u, + phi::errors::InvalidArgument( + "The size %d of outputs must be equal to 1.", output_size)); + PADDLE_ENFORCE_EQ( (*this)->result(0).type().isa(), - "Type validation failed for the 0th output."); + true, + phi::errors::InvalidArgument( + "Type validation failed for the 0th output.")); } VLOG(4) << "End Verifying for: ExpandOp."; } @@ -3184,9 +3283,11 @@ std::vector ExpandOp::InferMeta( const std::vector &input_values, pir::AttributeMap *p_attributes) { VLOG(4) << "Start infermeta ExpandOp"; - IR_ENFORCE(input_values.size() == 2, - "Num of inputs is expected to be 2 but got %d.", - input_values.size()); + PADDLE_ENFORCE_EQ(input_values.size(), + 2, + phi::errors::InvalidArgument( + "Num of inputs is expected to be 2 but got %d.", + input_values.size())); pir::Value x_ = input_values[0]; pir::Value shape_ = input_values[1]; @@ -3341,8 +3442,10 @@ void IncrementOp::Build(pir::Builder &builder, pir::AttributeMap attributes) { VLOG(4) << "Start build IncrementOp"; - IR_ENFORCE(attributes.find("value") != attributes.end(), - "'value' Attribute is expected for IncrementOp. "); + PADDLE_ENFORCE_NE(attributes.find("value"), + attributes.end(), + phi::errors::InvalidArgument( + "'value' Attribute is expected for IncrementOp. 
")); float value = attributes.at("value").dyn_cast().data(); VLOG(4) << "Builder construction inputs"; @@ -3367,32 +3470,45 @@ void IncrementOp::VerifySig() { VLOG(4) << "Verifying inputs:"; { auto input_size = num_operands(); - IR_ENFORCE(input_size == 1u, - "The size %d of inputs must be equal to 1.", - input_size); - IR_ENFORCE((*this) - ->operand_source(0) - .type() - .isa(), - "Type validation failed for the 0th input, got %s.", - (*this)->operand_source(0).type()); + PADDLE_ENFORCE_EQ( + input_size, + 1u, + phi::errors::InvalidArgument( + "The size %d of inputs must be equal to 1.", input_size)); + PADDLE_ENFORCE_EQ((*this) + ->operand_source(0) + .type() + .isa(), + true, + phi::errors::InvalidArgument( + "Type validation failed for the 0th input, got %s.", + (*this)->operand_source(0).type())); } VLOG(4) << "Verifying attributes:"; { auto &attributes = this->attributes(); - IR_ENFORCE(attributes.count("value") > 0, "value does not exist."); - IR_ENFORCE(attributes.at("value").isa(), - "Type of attribute: value is not pir::FloatAttribute."); + PADDLE_ENFORCE_GT(attributes.count("value"), + 0, + phi::errors::InvalidArgument("value does not exist.")); + PADDLE_ENFORCE_EQ( + attributes.at("value").isa(), + true, + phi::errors::InvalidArgument( + "Type of attribute: value is not pir::FloatAttribute.")); } VLOG(4) << "Verifying outputs:"; { auto output_size = num_results(); - IR_ENFORCE(output_size == 1u, - "The size %d of outputs must be equal to 1.", - output_size); - IR_ENFORCE( + PADDLE_ENFORCE_EQ( + output_size, + 1u, + phi::errors::InvalidArgument( + "The size %d of outputs must be equal to 1.", output_size)); + PADDLE_ENFORCE_EQ( (*this)->result(0).type().isa(), - "Type validation failed for the 0th output."); + true, + phi::errors::InvalidArgument( + "Type validation failed for the 0th output.")); } VLOG(4) << "End Verifying for: IncrementOp."; } @@ -3411,13 +3527,17 @@ std::vector IncrementOp::InferMeta( "AttrtibueMap pointer in InferMeta function is nullptr.")); auto &attributes = *p_attributes; VLOG(4) << "Start infermeta IncrementOp"; - IR_ENFORCE(input_values.size() == 1, - "Num of inputs is expected to be 1 but got %d.", - input_values.size()); + PADDLE_ENFORCE_EQ(input_values.size(), + 1, + phi::errors::InvalidArgument( + "Num of inputs is expected to be 1 but got %d.", + input_values.size())); pir::Value x_ = input_values[0]; - IR_ENFORCE(attributes.find("value") != attributes.end(), - "'value' Attribute is expected for IncrementOp. "); + PADDLE_ENFORCE_NE(attributes.find("value"), + attributes.end(), + phi::errors::InvalidArgument( + "'value' Attribute is expected for IncrementOp. ")); float value = attributes.at("value").dyn_cast().data(); VLOG(4) << "Builder construction outputs"; @@ -3526,8 +3646,10 @@ void Increment_Op::Build(pir::Builder &builder, pir::AttributeMap attributes) { VLOG(4) << "Start build Increment_Op"; - IR_ENFORCE(attributes.find("value") != attributes.end(), - "'value' Attribute is expected for Increment_Op. "); + PADDLE_ENFORCE_NE(attributes.find("value"), + attributes.end(), + phi::errors::InvalidArgument( + "'value' Attribute is expected for Increment_Op. 
")); float value = attributes.at("value").dyn_cast().data(); VLOG(4) << "Builder construction inputs"; @@ -3553,32 +3675,45 @@ void Increment_Op::VerifySig() { VLOG(4) << "Verifying inputs:"; { auto input_size = num_operands(); - IR_ENFORCE(input_size == 1u, - "The size %d of inputs must be equal to 1.", - input_size); - IR_ENFORCE((*this) - ->operand_source(0) - .type() - .isa(), - "Type validation failed for the 0th input, got %s.", - (*this)->operand_source(0).type()); + PADDLE_ENFORCE_EQ( + input_size, + 1u, + phi::errors::InvalidArgument( + "The size %d of inputs must be equal to 1.", input_size)); + PADDLE_ENFORCE_EQ((*this) + ->operand_source(0) + .type() + .isa(), + true, + phi::errors::InvalidArgument( + "Type validation failed for the 0th input, got %s.", + (*this)->operand_source(0).type())); } VLOG(4) << "Verifying attributes:"; { auto &attributes = this->attributes(); - IR_ENFORCE(attributes.count("value") > 0, "value does not exist."); - IR_ENFORCE(attributes.at("value").isa(), - "Type of attribute: value is not pir::FloatAttribute."); + PADDLE_ENFORCE_GT(attributes.count("value"), + 0, + phi::errors::InvalidArgument("value does not exist.")); + PADDLE_ENFORCE_EQ( + attributes.at("value").isa(), + true, + phi::errors::InvalidArgument( + "Type of attribute: value is not pir::FloatAttribute.")); } VLOG(4) << "Verifying outputs:"; { auto output_size = num_results(); - IR_ENFORCE(output_size == 1u, - "The size %d of outputs must be equal to 1.", - output_size); - IR_ENFORCE( + PADDLE_ENFORCE_EQ( + output_size, + 1u, + phi::errors::InvalidArgument( + "The size %d of outputs must be equal to 1.", output_size)); + PADDLE_ENFORCE_EQ( (*this)->result(0).type().isa(), - "Type validation failed for the 0th output."); + true, + phi::errors::InvalidArgument( + "Type validation failed for the 0th output.")); } VLOG(4) << "End Verifying for: Increment_Op."; } @@ -3597,13 +3732,17 @@ std::vector Increment_Op::InferMeta( "AttrtibueMap pointer in InferMeta function is nullptr.")); auto &attributes = *p_attributes; VLOG(4) << "Start infermeta Increment_Op"; - IR_ENFORCE(input_values.size() == 1, - "Num of inputs is expected to be 1 but got %d.", - input_values.size()); + PADDLE_ENFORCE_EQ(input_values.size(), + 1, + phi::errors::InvalidArgument( + "Num of inputs is expected to be 1 but got %d.", + input_values.size())); pir::Value x_ = input_values[0]; - IR_ENFORCE(attributes.find("value") != attributes.end(), - "'value' Attribute is expected for Increment_Op. "); + PADDLE_ENFORCE_NE(attributes.find("value"), + attributes.end(), + phi::errors::InvalidArgument( + "'value' Attribute is expected for Increment_Op. 
")); float value = attributes.at("value").dyn_cast().data(); VLOG(4) << "Builder construction outputs"; @@ -3715,21 +3854,27 @@ void AssignOut_Op::VerifySig() { VLOG(4) << "Verifying inputs:"; { auto input_size = num_operands(); - IR_ENFORCE(input_size == 2u, - "The size %d of inputs must be equal to 2.", - input_size); - IR_ENFORCE((*this) - ->operand_source(0) - .type() - .isa(), - "Type validation failed for the 0th input, got %s.", - (*this)->operand_source(0).type()); - IR_ENFORCE((*this) - ->operand_source(1) - .type() - .isa(), - "Type validation failed for the 1th input, got %s.", - (*this)->operand_source(1).type()); + PADDLE_ENFORCE_EQ( + input_size, + 2u, + phi::errors::InvalidArgument( + "The size %d of inputs must be equal to 2.", input_size)); + PADDLE_ENFORCE_EQ((*this) + ->operand_source(0) + .type() + .isa(), + true, + phi::errors::InvalidArgument( + "Type validation failed for the 0th input, got %s.", + (*this)->operand_source(0).type())); + PADDLE_ENFORCE_EQ((*this) + ->operand_source(1) + .type() + .isa(), + true, + phi::errors::InvalidArgument( + "Type validation failed for the 1th input, got %s.", + (*this)->operand_source(1).type())); } VLOG(4) << "Verifying attributes:"; { @@ -3738,12 +3883,16 @@ void AssignOut_Op::VerifySig() { VLOG(4) << "Verifying outputs:"; { auto output_size = num_results(); - IR_ENFORCE(output_size == 1u, - "The size %d of outputs must be equal to 1.", - output_size); - IR_ENFORCE( + PADDLE_ENFORCE_EQ( + output_size, + 1u, + phi::errors::InvalidArgument( + "The size %d of outputs must be equal to 1.", output_size)); + PADDLE_ENFORCE_EQ( (*this)->result(0).type().isa(), - "Type validation failed for the 0th output."); + true, + phi::errors::InvalidArgument( + "Type validation failed for the 0th output.")); } VLOG(4) << "End Verifying for: AssignOut_Op."; } @@ -3756,9 +3905,11 @@ void AssignOut_Op::InferMeta(phi::InferMetaContext *infer_meta) { std::vector AssignOut_Op::InferMeta( const std::vector &input_values, pir::AttributeMap *p_attributes) { - IR_ENFORCE(input_values.size() == 2, - "Num of inputs is expected to be 2 but got %d.", - input_values.size()); + PADDLE_ENFORCE_EQ(input_values.size(), + 2, + phi::errors::InvalidArgument( + "Num of inputs is expected to be 2 but got %d.", + input_values.size())); pir::Value x_ = input_values[0]; VLOG(4) << "Builder construction outputs"; @@ -3808,6 +3959,29 @@ phi::DataType AssignOut_Op::GetKernelTypeForVar( return expected_kernel_dtype; } +OpInfoTuple ShapeBroadcastOp::GetOpInfo() { + std::vector inputs = { + paddle::dialect::OpInputInfo( + "x", "paddle::dialect::DenseTensorType", false, false, false, true), + paddle::dialect::OpInputInfo( + "y", "paddle::dialect::DenseTensorType", false, false, false, true)}; + std::vector attributes = {}; + std::vector outputs = { + paddle::dialect::OpOutputInfo( + "out", "paddle::dialect::DenseTensorType", false, false)}; + paddle::dialect::OpRunTimeInfo run_time_info = + paddle::dialect::OpRunTimeInfo("ElementwiseInferMeta", + {"x", "y"}, + "shape_broadcast", + {"x", "y"}, + {}, + {}, + {}, + {}); + return std::make_tuple( + inputs, attributes, outputs, run_time_info, "shape_broadcast"); +} + void ShapeBroadcastOp::Build(pir::Builder &builder, pir::OperationArgument &argument, pir::Value x_, @@ -3836,9 +4010,11 @@ std::vector ShapeBroadcastOp::InferMeta( const std::vector &input_values, pir::AttributeMap *p_attributes) { VLOG(4) << "Start infermeta ShapeBroadcastOp"; - IR_ENFORCE(input_values.size() == 2, - "Num of inputs is expected to be 2 but got %d.", - 
input_values.size()); + PADDLE_ENFORCE_EQ(input_values.size(), + 2, + phi::errors::InvalidArgument( + "Num of inputs is expected to be 2 but got %d.", + input_values.size())); pir::Value x_ = input_values[0]; pir::Value y_ = input_values[1]; @@ -3921,11 +4097,14 @@ symbol::DimExpr GetBroadcastDimExpr(const symbol::DimExpr &lhs, std::vector ComputeBroadcastShape( const std::vector &large_shape, const std::vector &small_shape) { - IR_ENFORCE(large_shape.size() >= small_shape.size(), - "Size of large_shape is expected to be greater or equal size of " - "small_shape, but got [%d] >= [%d].", - large_shape.size(), - small_shape.size()); + PADDLE_ENFORCE_GE( + large_shape.size(), + small_shape.size(), + phi::errors::InvalidArgument( + "Size of large_shape is expected to be greater or equal size of " + "small_shape, but got [%d] >= [%d].", + large_shape.size(), + small_shape.size())); std::vector output_data; output_data.reserve(large_shape.size()); auto rank_gap = large_shape.size() - small_shape.size(); @@ -3944,16 +4123,22 @@ bool ShapeBroadcastOp::InferSymbolicShape( pir::Value x = operand_source(0); pir::Value y = operand_source(1); - IR_ENFORCE(shape_analysis->HasShapeOrDataForValue(x) > 0, - "Value x does not exist."); - IR_ENFORCE(shape_analysis->HasShapeOrDataForValue(y) > 0, - "Value y does not exist."); + PADDLE_ENFORCE_GT(shape_analysis->HasShapeOrDataForValue(x), + 0, + phi::errors::InvalidArgument("Value x does not exist.")); + PADDLE_ENFORCE_GT(shape_analysis->HasShapeOrDataForValue(y), + 0, + phi::errors::InvalidArgument("Value y does not exist.")); const auto &x_data_shape = shape_analysis->GetShapeOrDataForValue(x); const auto &y_data_shape = shape_analysis->GetShapeOrDataForValue(y); - IR_ENFORCE(x_data_shape.data().has_value(), - "Value x comes from ShapeOp, it must have data"); - IR_ENFORCE(y_data_shape.data().has_value(), - "Value y comes from ShapeOp, it must have data"); + PADDLE_ENFORCE_EQ(x_data_shape.data().has_value(), + true, + phi::errors::InvalidArgument( + "Value x comes from ShapeOp, it must have data")); + PADDLE_ENFORCE_EQ(y_data_shape.data().has_value(), + true, + phi::errors::InvalidArgument( + "Value y comes from ShapeOp, it must have data")); const auto &x_data = x_data_shape.data().value(); const auto &y_data = y_data_shape.data().value(); @@ -4005,34 +4190,48 @@ void MemcpyD2hMultiIoOp::VerifySig() { VLOG(4) << "Verifying inputs:"; { auto input_size = num_operands(); - IR_ENFORCE(input_size == 1u, - "The size %d of inputs must be equal to 1.", - input_size); - IR_ENFORCE((*this) - ->operand_source(0) - .type() - .isa(), - "Type validation failed for the 0th input, got %s.", - (*this)->operand_source(0).type()); + PADDLE_ENFORCE_EQ( + input_size, + 1u, + phi::errors::InvalidArgument( + "The size %d of inputs must be equal to 1.", input_size)); + PADDLE_ENFORCE_EQ((*this) + ->operand_source(0) + .type() + .isa(), + true, + phi::errors::InvalidArgument( + "Type validation failed for the 0th input, got %s.", + (*this)->operand_source(0).type())); } VLOG(4) << "Verifying attributes:"; { auto &attributes = this->attributes(); - IR_ENFORCE(attributes.count("dst_place_type") > 0, - "dst_place_type does not exist."); - IR_ENFORCE(attributes.at("dst_place_type").isa(), - "Type of attribute: dst_place_type is not pir::Int32Attribute."); + PADDLE_ENFORCE_GT( + attributes.count("dst_place_type"), + 0, + phi::errors::InvalidArgument("dst_place_type does not exist.")); + PADDLE_ENFORCE_EQ( + attributes.at("dst_place_type").isa(), + true, + phi::errors::InvalidArgument( + 
"Type of attribute: dst_place_type is not pir::Int32Attribute.")); } VLOG(4) << "Verifying outputs:"; { auto output_size = num_results(); - IR_ENFORCE(output_size == 1u, - "The size %d of outputs must be equal to 1.", - output_size); + PADDLE_ENFORCE_EQ( + output_size, + 1u, + phi::errors::InvalidArgument( + "The size %d of outputs must be equal to 1.", output_size)); auto output_0_type = (*this)->result(0).type(); - IR_ENFORCE(output_0_type.isa(), - "Type validation failed for the 0th output."); + PADDLE_ENFORCE_EQ( + output_0_type.isa(), + true, + phi::errors::InvalidArgument( + "Type validation failed for the 0th output.")); } VLOG(4) << "End Verifying for: MemcpyD2hMultiIoOp."; } @@ -4045,9 +4244,11 @@ void MemcpyD2hMultiIoOp::InferMeta(phi::InferMetaContext *infer_meta) { std::vector MemcpyD2hMultiIoOp::InferMeta( const std::vector &input_values, pir::AttributeMap *p_attributes) { - IR_ENFORCE(input_values.size() == 1, - "Num of inputs is expected to be 1 but got %d.", - input_values.size()); + PADDLE_ENFORCE_EQ(input_values.size(), + 1, + phi::errors::InvalidArgument( + "Num of inputs is expected to be 1 but got %d.", + input_values.size())); pir::Value x_ = input_values[0]; VLOG(4) << "Builder construction outputs"; @@ -4130,35 +4331,50 @@ void ArrayPopOp::VerifySig() { VLOG(4) << "Verifying inputs:"; { auto input_size = num_operands(); - IR_ENFORCE(input_size == 1u, - "The size %d of inputs must be equal to 1.", - input_size); - IR_ENFORCE((*this) - ->operand_source(0) - .type() - .isa(), - "Type validation failed for the 0th input, got %s.", - (*this)->operand_source(0).type()); + PADDLE_ENFORCE_EQ( + input_size, + 1u, + phi::errors::InvalidArgument( + "The size %d of inputs must be equal to 1.", input_size)); + PADDLE_ENFORCE_EQ((*this) + ->operand_source(0) + .type() + .isa(), + true, + phi::errors::InvalidArgument( + "Type validation failed for the 0th input, got %s.", + (*this)->operand_source(0).type())); } VLOG(4) << "Verifying attributes:"; { auto &attributes = this->attributes(); - IR_ENFORCE(attributes.count("index") > 0, "index does not exist."); - IR_ENFORCE(attributes.at("index").isa(), - "Type of attribute: index is not pir::Int32Attribute."); + PADDLE_ENFORCE_GT(attributes.count("index"), + 0, + phi::errors::InvalidArgument("index does not exist.")); + PADDLE_ENFORCE_EQ( + attributes.at("index").isa(), + true, + phi::errors::InvalidArgument( + "Type of attribute: index is not pir::Int32Attribute.")); } VLOG(4) << "Verifying outputs:"; { auto output_size = num_results(); - IR_ENFORCE(output_size == 2u, - "The size %d of outputs must be equal to 2.", - output_size); - IR_ENFORCE( + PADDLE_ENFORCE_EQ( + output_size, + 2u, + phi::errors::InvalidArgument( + "The size %d of outputs must be equal to 2.", output_size)); + PADDLE_ENFORCE_EQ( (*this)->result(0).type().isa(), - "Type validation failed for the 0th output."); - IR_ENFORCE( + true, + phi::errors::InvalidArgument( + "Type validation failed for the 0th output.")); + PADDLE_ENFORCE_EQ( (*this)->result(1).type().isa(), - "Type validation failed for the 1st output."); + true, + phi::errors::InvalidArgument( + "Type validation failed for the 1st output.")); } VLOG(4) << "End Verifying for: ArrayPopOp."; } @@ -4198,9 +4414,11 @@ std::vector ArrayPopOp::InferMeta( "AttrtibueMap pointer in InferMeta function is nullptr.")); auto &attributes = *p_attributes; VLOG(4) << "Start infermeta ArrayPopOp"; - IR_ENFORCE(input_values.size() == 1, - "Num of inputs is expected to be 1 but got %d.", - input_values.size()); + 
PADDLE_ENFORCE_EQ(input_values.size(), + 1, + phi::errors::InvalidArgument( + "Num of inputs is expected to be 1 but got %d.", + input_values.size())); pir::Value input = input_values[0]; VLOG(4) << "Builder construction outputs"; @@ -4213,8 +4431,10 @@ std::vector ArrayPopOp::InferMeta( "paddle::dialect::AllocatedDenseTensorArrayType")); } - IR_ENFORCE(attributes.find("index") != attributes.end(), - "'index' Attribute is expected for ArrayPopOp. "); + PADDLE_ENFORCE_NE(attributes.find("index"), + attributes.end(), + phi::errors::InvalidArgument( + "'index' Attribute is expected for ArrayPopOp. ")); int index = attributes.at("index").dyn_cast().data(); paddle::dialect::IrTensor dense_input( diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op.h b/paddle/fluid/pir/dialect/operator/ir/manual_op.h index 8d13c11d06a59..7f472ef1fecab 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_op.h +++ b/paddle/fluid/pir/dialect/operator/ir/manual_op.h @@ -734,6 +734,7 @@ class MemcpyD2hMultiIoOp class IR_API ShapeBroadcastOp : public pir::Op { public: @@ -741,6 +742,7 @@ class IR_API ShapeBroadcastOp static const char *name() { return "pd_op.shape_broadcast"; } static constexpr const char **attributes_name = nullptr; static constexpr uint32_t attributes_num = 0; + static OpInfoTuple GetOpInfo(); static void Build(pir::Builder &builder, // NOLINT pir::OperationArgument &argument, // NOLINT pir::Value x_, diff --git a/paddle/fluid/pir/dialect/operator/ir/op_attribute.cc b/paddle/fluid/pir/dialect/operator/ir/op_attribute.cc index 2f4c9a2b7e504..8a843a8881734 100644 --- a/paddle/fluid/pir/dialect/operator/ir/op_attribute.cc +++ b/paddle/fluid/pir/dialect/operator/ir/op_attribute.cc @@ -30,7 +30,7 @@ phi::DataLayout DataLayoutAttribute::data() const { return storage()->GetAsKey(); } -phi::Scalar ScalarAttribute::data() { +phi::Scalar ScalarAttribute::data() const { if (isa()) { return phi::Scalar(dyn_cast().data()); } else if (isa()) { diff --git a/paddle/fluid/pir/dialect/operator/ir/op_attribute.h b/paddle/fluid/pir/dialect/operator/ir/op_attribute.h index 153414c7ad0d0..b7a54d6ca58d2 100644 --- a/paddle/fluid/pir/dialect/operator/ir/op_attribute.h +++ b/paddle/fluid/pir/dialect/operator/ir/op_attribute.h @@ -37,6 +37,8 @@ class IntArrayAttribute : public pir::Attribute { static IntArrayAttribute Parse(pir::IrParser &parser); // NOLINT const phi::IntArray &data() const; + + static std::string name() { return "a_intarray"; } }; class ScalarAttribute : public pir::Attribute { @@ -59,7 +61,9 @@ class ScalarAttribute : public pir::Attribute { return TransToIrAttribute(scalar, ctx); } - phi::Scalar data(); + phi::Scalar data() const; + + static std::string name() { return "a_scalar"; } }; class DataTypeAttribute : public pir::Attribute { @@ -76,6 +80,8 @@ class DataTypeAttribute : public pir::Attribute { static DataTypeAttribute Parse(pir::IrParser &parser); // NOLINT phi::DataType data() const; + + static std::string name() { return "a_dtype"; } }; class PlaceAttribute : public pir::Attribute { @@ -91,6 +97,7 @@ class PlaceAttribute : public pir::Attribute { static PlaceAttribute Parse(pir::IrParser &parser); // NOLINT phi::Place data() const; + static std::string name() { return "a_place"; } }; class DataLayoutAttribute : public pir::Attribute { diff --git a/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc b/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc index 1beaf8369bdc7..fa9fccaba2701 100644 --- a/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc +++ 
b/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc @@ -46,9 +46,11 @@ struct CombineOpInferSymbolicShapeInterfaceModel const auto shape_data_list = [&] { symbol::TensorListShapeOrDataDimExprs shape_data_list; for (size_t i = 0; i < op->num_operands(); ++i) { - IR_ENFORCE(op->operand(i).type().dyn_cast(), - "Currently InferSymbolicShape of CombineOp only support " - "DenseTensorType."); + PADDLE_ENFORCE_NOT_NULL( + op->operand(i).type().dyn_cast(), + phi::errors::InvalidArgument( + "Currently InferSymbolicShape of CombineOp only support " + "DenseTensorType.")); shape_data_list.emplace_back( shape_analysis->GetShapeOrDataForValue(op->operand_source(i)) @@ -70,9 +72,11 @@ struct ConstantOpInferSymbolicShapeInterfaceModel : public InferSymbolicShapeInterface::Concept { static inline bool InferSymbolicShape( pir::Operation* op, pir::ShapeConstraintIRAnalysis* shape_analysis) { - IR_ENFORCE(op->result(0).type().dyn_cast(), - "Currently InferSymbolicShape of ConstantOp only support " - "DenseTensorType result."); + PADDLE_ENFORCE_NOT_NULL( + op->result(0).type().dyn_cast(), + phi::errors::InvalidArgument( + "Currently InferSymbolicShape of ConstantOp only support " + "DenseTensorType result.")); const std::vector out_dims = [op] { std::vector dims; @@ -243,9 +247,9 @@ OperatorDialect::OperatorDialect(pir::IrContext* ctx) ShadowOutputOpInferSymbolicShapeInterfaceModel>()); info = ctx->GetRegisteredOpInfo(pir::SplitOp::name()); - info.AttachInterface(std::move( + info.AttachInterface( pir::InterfaceValue::Get())); + SplitOpInferSymbolicShapeInterfaceModel>()); info = ctx->GetRegisteredOpInfo(pir::YieldOp::name()); info.AttachInterface( diff --git a/paddle/fluid/pir/dialect/operator/ir/ops.yaml b/paddle/fluid/pir/dialect/operator/ir/ops.yaml index 4da4f54c3ac90..11ff0e8f47c90 100644 --- a/paddle/fluid/pir/dialect/operator/ir/ops.yaml +++ b/paddle/fluid/pir/dialect/operator/ir/ops.yaml @@ -1,7 +1,7 @@ # The operators included in this file are: # 1) Operators defined only in PIR, dynamic graphs do not exist; # 2) The definitions of static graphs and dynamic graphs are inconsistent, but the final definition plan has not yet been clarified. 
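One further variant appears in the op_dialect.cc hunks above: a pir dyn_cast yields a handle that evaluates as null when the cast fails, so failed casts are guarded with PADDLE_ENFORCE_NOT_NULL rather than with a comparison macro. Sketch (the dialect's DenseTensorType is assumed as the elided template argument):

    PADDLE_ENFORCE_NOT_NULL(
        op->result(0).type().dyn_cast<paddle::dialect::DenseTensorType>(),
        phi::errors::InvalidArgument(
            "Currently InferSymbolicShape of ConstantOp only support "
            "DenseTensorType result."));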
-# After the definition is clearly defined, migrate to paddle /fluid/pir/dialect/operator/ir/update_ops.yaml or paddle/phi/api/yaml/ops.yaml +# After the definition is clearly defined, migrate to paddle/fluid/pir/dialect/operator/ir/update_ops.yaml or paddle/phi/api/yaml/ops.yaml - op : adadelta_ args : (Tensor param, Tensor grad, Tensor avg_squared_grad, Tensor avg_squared_update, Tensor learning_rate, Tensor master_param, float rho, float epsilon, bool multi_precision) @@ -22,6 +22,7 @@ spmd_rule : ElementwiseBinaryInferSpmd kernel : func : add + data_type: x inplace : (x -> out) backward : add_grad data_transform : @@ -107,6 +108,14 @@ inplace : (output -> out) backward : assign_out__grad +- op : assign_pos + args : (Tensor x, Tensor cum_count, Tensor eff_num_len) + output : Tensor(out) + infer_meta : + func : AssignPosInferMeta + kernel : + func : assign_pos + - op : assign_value args : (int[] shape, DataType dtype, Scalar[] values, Place place = {}) output : Tensor(out) @@ -470,6 +479,24 @@ optional : in_accum, in_state, out_scale, out_accum, out_state inplace : (scale -> out_scale, in_accum -> out_accum, in_state -> out_state) +- op : dgc + args : (Tensor u, Tensor v, Tensor grad, Tensor param, Tensor current_step, Tensor nranks, float[] sparsity, float m=0.9, bool use_nesterov=true, float rampup_begin_step=0.0, float rampup_step=0.0, float regular_coeff=0.0, int regular_type=0) + output : Tensor(u_out), Tensor(v_out), Tensor(encode_grad), Tensor(grad_out), Tensor(k), Tensor(gather_buff) + kernel : + func : dgc + param : [u, v, grad, param, current_step, nranks, m, use_nesterov, sparsity, rampup_begin_step, rampup_step, regular_coeff, regular_type] + optional: param + +- op : dgc_momentum + args : (Tensor param, Tensor grad, Tensor velocity, Tensor learning_rate, Tensor master_param, Tensor current_step_tensor, Tensor nranks_tensor, float mu, bool use_nesterov=false, str regularization_method="", float regularization_coeff=0.0f, bool multi_precision=false, float rescale_grad=1.0f, float rampup_begin_step=-1.0f) + output : Tensor(param_out), Tensor(velocity_out), Tensor(master_param_out), Tensor(grad_out) + infer_meta : + func : DGCMomentumInferMeta + kernel : + func : dgc_momentum + data_type : param + optional : master_param, master_param_out + - op : disable_check_model_nan_inf args: (Tensor x, int flag = 0) output: Tensor(out) @@ -491,6 +518,16 @@ data_type : fpn_rois optional : rois_num, multi_level_rois_num +- op : distributed_fused_lamb + args : (Tensor[] param, Tensor[] grad, Tensor fp32_fused_param, Tensor fp32_fused_grad, Tensor fp16_fused_param, Tensor fp16_fused_grad, Tensor moment1, Tensor moment2, Tensor beta1pow, Tensor beta2pow, Tensor fused_param_offsets, Tensor fp32_shard_fused_param_offsets, Tensor fp16_shard_fused_param_offsets, Tensor param_info, Tensor param_order, Tensor learning_rate, Tensor global_scale, float beta1, float beta2, float epsilon, float max_global_grad_norm, float weight_decay, bool clip_after_allreduce, int[] ring_ids= {}, int acc_steps = 1, bool use_master_param_norm = true, bool use_master_acc_grad = true, bool is_grad_scaled_by_nranks = true, int64_t nranks = 1, bool use_hierarchical_allreduce = false) + output : Tensor(fp32_fused_param_out), Tensor(fp16_fused_param_out), Tensor(fp32_acc_fused_grad), Tensor(fp16_acc_fused_grad), Tensor(moment1_out), Tensor(moment2_out), Tensor(beta1pow_out), Tensor(beta2pow_out), Tensor[](param_out){param.size()}, Tensor(found_inf), Tensor(acc_step), Tensor(stop_update), Tensor(step) + kernel : + func : 
distributed_fused_lamb + data_type : DataType::FLOAT32 + param : [param, grad, fp32_fused_param, fp32_fused_grad, fp16_fused_param, fp16fused_grad, moment1, moment2, beta1pow, beta2pow, fused_param_offsets, fp32_shard_fused_param_offsets, fp16_shard_fused_param_offsets, param_info, param_order, learning_rate, global_scale, acc_steps, beta1, beta2, epsilon, max_global_grad_norm, weight_decay, clip_after_allreduce, use_master_param_norm, use_master_acc_grad, is_grad_scaled_by_nranks, use_hierarchical_allreduce, nranks, ring_ids] + optional : fp32_fused_param, fp32_fused_grad, fp16_fused_param, fp16_fused_grad, fp32_fused_param_out, fp16_fused_param_out, fp32_acc_fused_grad, fp16_acc_fused_grad, acc_step, stop_update + inplace : (fp32_fused_param -> fp32_fused_param_out), (fp16_fused_param -> fp16_fused_param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1pow -> beta1pow_out), (beta2pow -> beta2pow_out), (param -> param_out) + - op : distributed_fused_lamb_init args : (Tensor[] param, Tensor[] grad, float beta1, float beta2, int[] apply_weight_decay, int alignment, int rank, int nranks) output : Tensor(fp32_fused_param), Tensor(fp32_fused_grad), Tensor(fp16_fused_param), Tensor(fp16_fused_grad), Tensor(moment1), Tensor(moment2), Tensor(beta1_pow), Tensor(beta2_pow), Tensor(fused_param_offsets), Tensor(fp32_shard_fused_param_offsets), Tensor(fp16_shard_fused_param_offsets), Tensor(param_info), Tensor(param_order), Tensor[](param_out){param.size()}, Tensor[](master_param_out){param.size()}, Tensor[](grad_out){grad.size()}, Tensor(global_scale), Tensor(step) @@ -661,6 +698,7 @@ args : (str name, int col) output : Tensor(out) interfaces : paddle::dialect::InferSymbolicShapeInterface + traits: pir::ImmutableLayoutTrait - op : fetch args : (Tensor x, str name, int col) @@ -671,7 +709,7 @@ kernel : func : fetch param : [x] - traits : pir::SideEffectTrait + traits : pir::SideEffectTrait, pir::ImmutableLayoutTrait interfaces : paddle::dialect::InferSymbolicShapeInterface - op : floor_divide @@ -748,7 +786,7 @@ skip_transform : x - op : full_with_tensor - args : (Tensor shape, Tensor value, DataType dtype=DataType::FLOAT32) + args : (Tensor value, IntArray shape, DataType dtype=DataType::FLOAT32) output: Tensor(out) infer_meta : func : FullWithTensorInferMeta diff --git a/paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml b/paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml index 2f3d370e4ccff..452b845a43a1a 100644 --- a/paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml +++ b/paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml @@ -27,6 +27,7 @@ spmd_rule : ElementwiseBinaryGradInferSpmd kernel : func : add_grad + data_type: out_grad no_need_buffer : x, y composite : add_grad(x, y, out_grad, axis, x_grad, y_grad) backward : add_double_grad @@ -201,15 +202,15 @@ - backward_op : divide_double_grad forward : divide_grad (Tensor x, Tensor y, Tensor out, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) - args : (Tensor y, Tensor out, Tensor grad_x, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) + args : (Tensor y, Tensor out, Tensor grad_out, Tensor grad_x, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) output : Tensor(y_grad), Tensor(out_grad), Tensor(grad_out_grad) infer_meta : func : GeneralTernaryGradInferMeta - param : [y, grad_x, grad_x] + param : [y, out, out] kernel : func : divide_double_grad data_type : out - optional : grad_x_grad, grad_y_grad + optional : grad_x, grad_x_grad, grad_y_grad inplace : (grad_x_grad -> grad_out_grad) - 
backward_op : divide_grad diff --git a/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h b/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h index 86370dd0cc6c1..e8719d4adb73e 100644 --- a/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h +++ b/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h @@ -98,8 +98,9 @@ struct OpRunTimeInfo { std::vector skip_transform_inputs; pir::AttributeMap extra_args_default_value; std::vector data_format_tensors; - bool is_onednn_only; - bool dynamic_fallback; + bool is_onednn_only = false; + bool dynamic_fallback = false; + OpRunTimeInfo() = default; OpRunTimeInfo(const std::string& infer_meta_func, const std::vector& infer_meta_param, diff --git a/paddle/fluid/pir/drr/CMakeLists.txt b/paddle/fluid/pir/drr/CMakeLists.txt index b23774a431795..ded64839dc97f 100644 --- a/paddle/fluid/pir/drr/CMakeLists.txt +++ b/paddle/fluid/pir/drr/CMakeLists.txt @@ -87,7 +87,7 @@ if(WITH_CINN) set(DRR_SRCS ${DRR_SRCS} ${CINN_SOURCE_FILE}) endif() -if(WITH_MKLDNN) +if(WITH_ONEDNN) set(onednn_dialect_name onednn_op) set(pir_op_onednn_yaml ${PADDLE_BINARY_DIR}/paddle/fluid/pir/dialect/operator/ir/generated/onednn.parsed.yaml diff --git a/paddle/fluid/pir/drr/include/drr_pattern_context.h b/paddle/fluid/pir/drr/include/drr_pattern_context.h index b7755f659e85d..17090fb3e210a 100644 --- a/paddle/fluid/pir/drr/include/drr_pattern_context.h +++ b/paddle/fluid/pir/drr/include/drr_pattern_context.h @@ -129,10 +129,11 @@ class TEST_API DrrPatternContext { const Op& ResultOpPattern( const std::string& op_type, - const std::unordered_map& attributes = {}); + const std::unordered_map& attributes = {}, + const std::unordered_map& runtime_attributes = + {}); drr::Tensor& ResultTensorPattern(const std::string& name); - // void RequireEqual(const Attribute& first, const Attribute& second); void RequireEqual(const TensorShape& first, const TensorShape& second); void RequireEqual(const TensorDataType& first, const TensorDataType& second); void RequireNativeCall(const ConstraintFunction& custom_fn); @@ -157,34 +158,28 @@ class Op { const Tensor& arg2) const; TEST_API void operator()(const std::vector& args, const std::vector& outputs) const; - // const Tensor& operator()(const Tensor& arg0, const Tensor& arg1, const - // Tensor& arg2) const; const Tensor& operator()(const Tensor& arg0, const - // Tensor& arg1, const Tensor& arg2, const Tensor& arg3) const; const Tensor& - // operator()(const Tensor& arg0, const Tensor& arg1, const Tensor& arg2, - // const Tensor& arg3, const Tensor& arg4) const; static const char* prefix; private: Op(const std::string& op_type_name, + PatternGraph* pattern_graph, const std::unordered_map& attributes, - PatternGraph* pattern_graph) + const std::unordered_map& runtime_attributes = {}) : op_type_name_(op_type_name), + pattern_graph_(pattern_graph), attributes_(attributes), - pattern_graph_(pattern_graph) {} - - const std::unordered_map& attributes() const { - return attributes_; - } - - friend class DrrPatternContext; - friend class OpCall; + runtime_attributes_(runtime_attributes) {} std::string op_type_name_; - std::unordered_map attributes_; PatternGraph* pattern_graph_{nullptr}; + std::unordered_map attributes_; + std::unordered_map runtime_attributes_; thread_local static int64_t count; + + friend class DrrPatternContext; + friend class OpCall; }; class TEST_API Tensor { @@ -244,7 +239,8 @@ class TEST_API OpCall { : op_name_(op->op_type_name_), inputs_(inputs), outputs_(outputs), - attributes_(op->attributes_) {} + 
attributes_(op->attributes_), + runtime_attributes_(op->runtime_attributes_) {} const std::string& name() const { return op_name_; } @@ -256,18 +252,24 @@ class TEST_API OpCall { return attributes_; } + const std::unordered_map& runtime_attributes() const { + return runtime_attributes_; + } + private: std::string op_name_; std::vector inputs_; std::vector outputs_; std::unordered_map attributes_; + std::unordered_map runtime_attributes_; }; class TEST_API ResultPattern { public: - const drr::Op& Op( - const std::string& op_type, - const std::unordered_map& attributes = {}); + const drr::Op& + Op(const std::string& op_type, + const std::unordered_map& attributes = {}, + const std::unordered_map& runtime_attributes = {}); drr::Tensor& Tensor(const std::string& name); @@ -304,10 +306,44 @@ class TEST_API ResultPattern { Attribute VectorFloatAttr(const std::vector& value) const; + // {"bool", phi::DataType::BOOL}, + // {"uint8", phi::DataType::UINT8}, + // {"int8", phi::DataType::INT8}, + // {"uint16", phi::DataType::UINT16}, + // {"int16", phi::DataType::INT16}, + // {"uint32", phi::DataType::UINT32}, + // {"int32", phi::DataType::INT32}, + // {"uint64", phi::DataType::UINT64}, + // {"int64", phi::DataType::INT64}, + // {"float32", phi::DataType::FLOAT32}, + // {"complex64", phi::DataType::COMPLEX64}, + // {"complex128", phi::DataType::COMPLEX128}, + // {"Undefined", phi::DataType::UNDEFINED}, + // {"psting", phi::DataType::PSTRING}, + // {"float16", phi::DataType::FLOAT16}, + // {"bfloat16", phi::DataType::BFLOAT16}, + // {"float64", phi::DataType::FLOAT64}}; Attribute DataTypeAttr(const std::string& value) const; + // {"cpu", phi::CPUPlace{}}, + // {"gpu", phi::GPUPlace{}}, + // {"gpu_pinned", phi::GPUPinnedPlace{}}, + // {"xpu", phi::XPUPlace{}}, + // {"ipu", phi::IPUPlace{}}, + // {":", phi::CustomPlace{}}, + // {"undefined", phi::Place{}}}; Attribute PlaceAttr(const std::string& value) const; + // {"NHWC", phi::DataLayout::kNHWC}, + // {"NCHW", phi::DataLayout::kNCHW}, + // {"Undefined", phi::DataLayout::kAnyLayout}, + // {"ONEDNN", phi::DataLayout::ONEDNN}, + // {"SPARSE_COO", phi::DataLayout::SPARSE_COO}, + // {"SPARSE_CSR", phi::DataLayout::SPARSE_CSR}, + // {"NDHWC", phi::DataLayout::kNDHWC}, + // {"NCDHW", phi::DataLayout::kNCDHW}, + // {"PSTRING_UNION", phi::DataLayout::PSTRING_UNION}, + // {"STRIDED", phi::DataLayout::STRIDED}}; Attribute DataLayoutAttr(const std::string& value) const; Attribute ComputeAttr(const AttrComputeFunc& attr_compute_func) const; diff --git a/paddle/fluid/pir/drr/src/ir_operation_factory.cc b/paddle/fluid/pir/drr/src/ir_operation_factory.cc index e625db38d1b8f..20a281dd12d36 100644 --- a/paddle/fluid/pir/drr/src/ir_operation_factory.cc +++ b/paddle/fluid/pir/drr/src/ir_operation_factory.cc @@ -15,19 +15,22 @@ #include #include "paddle/common/layout.h" + #include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" +#ifdef PADDLE_WITH_DNNL +#include "paddle/fluid/pir/dialect/operator/ir/onednn_op.h" +#endif #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/fluid/pir/drr/include/drr_pattern_context.h" #include "paddle/fluid/pir/drr/src/attr_type_uilts.h" #include "paddle/fluid/pir/drr/src/ir_operation_factory.h" -#include "paddle/phi/core/enforce.h" + #include "paddle/pir/include/core/builtin_attribute.h" #include "paddle/pir/include/core/builtin_op.h" #include "paddle/pir/include/core/operation.h" #include "paddle/pir/include/core/value.h" -#ifdef PADDLE_WITH_DNNL -#include "paddle/fluid/pir/dialect/operator/ir/onednn_op.h" -#endif + 
+#include "paddle/phi/core/enforce.h" namespace paddle { namespace drr { @@ -317,10 +320,11 @@ pir::Attribute CreateIrAttribute(const std::any& obj) { } } -pir::AttributeMap CreateAttributeMap(const OpCall& op_call, - const MatchContextImpl& src_match_ctx) { +pir::AttributeMap CreateAttributeMap( + const std::unordered_map& attrs, + const MatchContextImpl& src_match_ctx) { pir::AttributeMap attr_map; - for (const auto& kv : op_call.attributes()) { + for (const auto& kv : attrs) { std::visit( [&](auto&& arg) { if constexpr (std::is_same_v, @@ -339,12 +343,12 @@ pir::AttributeMap CreateAttributeMap(const OpCall& op_call, return attr_map; } -pir::Value GetIrValueByDrrTensor(const Tensor& tensor, +pir::Value GetIrValueByDrrTensor(const Tensor* tensor, const MatchContextImpl& res_match_ctx) { - if (tensor.is_none()) { + if (tensor->is_none()) { return pir::Value{}; } - return res_match_ctx.GetIrValue(tensor.name()); + return res_match_ctx.GetIrValue(tensor->name()); } std::vector GetIrValuesByDrrTensors( @@ -353,16 +357,21 @@ std::vector GetIrValuesByDrrTensors( std::vector ir_values; ir_values.reserve(tensors.size()); for (const auto* tensor : tensors) { - ir_values.push_back(GetIrValueByDrrTensor(*tensor, res_match_ctx)); + ir_values.push_back(GetIrValueByDrrTensor(tensor, res_match_ctx)); } return ir_values; } -void BindIrOutputs(const OpCall& op_call, - pir::Operation* op, - MatchContextImpl* match_ctx) { - for (size_t i = 0; i < op_call.outputs().size(); ++i) { - match_ctx->BindIrValue(op_call.outputs()[i]->name(), op->result(i)); +void BindIrOutputsWithDrrOutputs(const std::vector& tensors, + pir::Operation* op, + MatchContextImpl* match_ctx) { + PADDLE_ENFORCE_LE( + tensors.size(), + op->num_results(), + phi::errors::InvalidArgument( + "The size of drr outputs should less equal the size of pir outputs")); + for (size_t i = 0; i < tensors.size(); ++i) { + match_ctx->BindIrValue(tensors[i]->name(), op->result(i)); } } @@ -371,15 +380,17 @@ pir::Operation* CreateOperation(const OpCall& op_call, pir::PatternRewriter& rewriter, // NOLINT MatchContextImpl* res_match_ctx) { VLOG(6) << "Drr create [" << op_call.name() << "] op..."; - const auto& inputs = op_call.inputs(); - std::vector ir_values = - GetIrValuesByDrrTensors(inputs, *res_match_ctx); pir::Operation* op = OperationFactory::Instance().CreateOperation( op_call.name(), - ir_values, - CreateAttributeMap(op_call, src_match_ctx), + GetIrValuesByDrrTensors(op_call.inputs(), *res_match_ctx), + CreateAttributeMap(op_call.attributes(), src_match_ctx), rewriter); - BindIrOutputs(op_call, op, res_match_ctx); + auto runtime_attr_map = + CreateAttributeMap(op_call.runtime_attributes(), src_match_ctx); + for (const auto& kv : runtime_attr_map) { + op->set_attribute(kv.first, kv.second); + } + BindIrOutputsWithDrrOutputs(op_call.outputs(), op, res_match_ctx); VLOG(6) << "Drr create [" << op_call.name() << " @" << op << "] op done."; return op; } diff --git a/paddle/fluid/pir/drr/src/ir_operation_factory.h b/paddle/fluid/pir/drr/src/ir_operation_factory.h index 23095bf9a73e0..eaf3f866ec60c 100644 --- a/paddle/fluid/pir/drr/src/ir_operation_factory.h +++ b/paddle/fluid/pir/drr/src/ir_operation_factory.h @@ -30,13 +30,13 @@ class OperationFactory { return operation_factory; } - using operation_create_fn = + using OperationCreateFunction = std::function&, const pir::AttributeMap&, pir::PatternRewriter&)>; void RegisterOperationCreator(const std::string& op_name, - const operation_create_fn& create_fn) { + const OperationCreateFunction& create_fn) { 
op_creator_map[op_name] = create_fn; } @@ -76,7 +76,7 @@ class OperationFactory { #ifdef PADDLE_WITH_DNNL void RegisterOnednnOpGeneratedOpCreator(); #endif - std::unordered_map op_creator_map; + std::unordered_map op_creator_map; }; pir::Operation* CreateOperation(const OpCall& op_call, diff --git a/paddle/fluid/pir/drr/src/match_context_impl.h b/paddle/fluid/pir/drr/src/match_context_impl.h index 12a0dc7a65ab5..a9acb5f6ed8df 100644 --- a/paddle/fluid/pir/drr/src/match_context_impl.h +++ b/paddle/fluid/pir/drr/src/match_context_impl.h @@ -37,10 +37,9 @@ class MatchContextImpl final { PADDLE_ENFORCE_NE( tensor_map_.count(tensor_name), 0, - phi::errors::NotFound( - "Not found tensor." - "The Drr tensor [%s] must exist in pattern graph to be obtained.", - tensor_name)); + phi::errors::NotFound("Not found tensor. The drr tensor [%s] must " + "exist in pattern graph to be obtained.", + tensor_name)); return tensor_map_.at(tensor_name); } @@ -48,10 +47,10 @@ class MatchContextImpl final { PADDLE_ENFORCE_NE( operation_map_.count(op_call), 0, - phi::errors::NotFound("Not found operation." - "The Drr operation [%s] must exist in the " - "pattern graph to be obtained.", - op_call->name())); + phi::errors::NotFound( + "Not found operation. The drr operation [%s] must exist in the " + "pattern graph to be obtained.", + op_call->name())); return operation_map_.at(op_call); } @@ -65,10 +64,10 @@ class MatchContextImpl final { PADDLE_ENFORCE_NE( iter, tensor_map_.end(), - phi::errors::NotFound("Not found tensor." - "The Drr tensor [%s] is not found in the map, " - "unable to obtain the corresponding IrValue.", - tensor_name)); + phi::errors::NotFound( + "Not found tensor. The drr tensor [%s] is not found in the map, " + "unable to obtain the corresponding IrValue.", + tensor_name)); return iter->second; } @@ -77,10 +76,10 @@ class MatchContextImpl final { PADDLE_ENFORCE_NE( iter, attr_map_.end(), - phi::errors::NotFound("Not found attr." - "The Drr attr [%s] is not found in the map, " - "unable to obtain the corresponding Attribute.", - attr_name)); + phi::errors::NotFound( + "Not found attr. 
The drr attr [%s] is not found in the map, unable " + "to obtain the corresponding Attribute.", + attr_name)); return iter->second; } diff --git a/paddle/fluid/pir/drr/src/pattern_context.cc b/paddle/fluid/pir/drr/src/pattern_context.cc index 7bdee5d5dcafe..fe72170bc9eea 100644 --- a/paddle/fluid/pir/drr/src/pattern_context.cc +++ b/paddle/fluid/pir/drr/src/pattern_context.cc @@ -39,7 +39,7 @@ const Op& DrrPatternContext::SourceOpPattern( const std::string& op_type, const std::unordered_map& attributes) { owned_ops_.push_back(std::shared_ptr( - new drr::Op(op_type, attributes, source_pattern_graph_.get()))); + new drr::Op(op_type, source_pattern_graph_.get(), attributes))); return *owned_ops_.back(); } @@ -50,9 +50,10 @@ drr::Tensor& DrrPatternContext::SourceTensorPattern(const std::string& name) { const Op& DrrPatternContext::ResultOpPattern( const std::string& op_type, - const std::unordered_map& attributes) { - owned_ops_.push_back(std::shared_ptr( - new drr::Op(op_type, attributes, result_pattern_graph_.get()))); + const std::unordered_map& attributes, + const std::unordered_map& runtime_attributes) { + owned_ops_.push_back(std::shared_ptr(new drr::Op( + op_type, result_pattern_graph_.get(), attributes, runtime_attributes))); return *owned_ops_.back(); } @@ -174,8 +175,9 @@ void Tensor::operator=(const Tensor& other) const { // NOLINT const drr::Op& ResultPattern::Op( const std::string& op_type, - const std::unordered_map& attributes) { - return ctx_->ResultOpPattern(op_type, attributes); + const std::unordered_map& attributes, + const std::unordered_map& runtime_attributes) { + return ctx_->ResultOpPattern(op_type, attributes, runtime_attributes); } drr::Tensor& ResultPattern::Tensor(const std::string& name) { diff --git a/paddle/fluid/pir/drr/src/rewrite_pattern.cc b/paddle/fluid/pir/drr/src/rewrite_pattern.cc index 2bd2fdc36b717..6b2c7cab2ba13 100644 --- a/paddle/fluid/pir/drr/src/rewrite_pattern.cc +++ b/paddle/fluid/pir/drr/src/rewrite_pattern.cc @@ -15,6 +15,7 @@ #include #include +#include "glog/vlog_is_on.h" #include "paddle/fluid/pir/drr/include/drr_pattern_base.h" #include "paddle/fluid/pir/drr/include/drr_rewrite_pattern.h" #include "paddle/fluid/pir/drr/src/ir_operation_factory.h" @@ -43,12 +44,17 @@ DrrRewritePattern::DrrRewritePattern( constraints_(drr_context.constraints()), result_pattern_graph_(drr_context.result_pattern_graph()), drr_pattern_owner_(drr_pattern_owner) { - PADDLE_ENFORCE_NE( - source_pattern_graph_->owned_op_call().empty(), - true, - phi::errors::InvalidArgument("Source pattern graph is empty." - "Suggested fix: Please check the DRR " - "source pattern definition code.")); + PADDLE_ENFORCE_NE(source_pattern_graph_->owned_op_call().empty(), + true, + phi::errors::InvalidArgument( + "Source pattern graph is empty. Suggested fix: please " + "check the drr source pattern definition code.")); + if (VLOG_IS_ON(4)) { + std::cout << "\nThe source pattern graph in [" << pattern_name << "]:\n" + << *source_pattern_graph_ << std::endl; + std::cout << "\nThe result pattern graph in [" << pattern_name << "]:\n" + << *result_pattern_graph_ << std::endl; + } } bool DrrRewritePattern::MatchAndRewrite( @@ -324,7 +330,11 @@ bool DrrRewritePattern::MatchFromOutputToInput( } return false; }; - + // Check whether Drr Tensor and IR Value is None. + const auto& IsNoneTensorAndValue = [](const Tensor* drr_input_tensor, + pir::Value ir_value) { + return drr_input_tensor->is_none() && ir_value == nullptr; + }; // Step 1: Initialize DRR matched queue. 
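+  // The walk below pops one drr op at a time and compares it with its pir
+  // counterpart: a drr input tensor marked none must correspond to a null
+  // pir value, an operand that was already visited aborts the match, and
+  // at the end the number of matched steps must equal the OpCall count of
+  // the source pattern graph.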
bool matched = true; size_t step = 0; @@ -348,7 +358,15 @@ bool DrrRewritePattern::MatchFromOutputToInput( auto ir_input_values = ir_node->operands_source(); for (size_t i = 0; i < drr_input_tensors.size(); ++i) { if (drr_input_tensors[i]->is_none()) { - continue; + if (IsNoneTensorAndValue(drr_input_tensors[i], ir_input_values[i])) { + continue; + } else { + VLOG(8) << drr_node->name() << " Match failed: drr_input[" << i + << "] != pir_input[" << i << "], drr_input_tensor[" << i + << "] is None."; + matched = false; + break; + } } if (HasVisitedOperands(drr_input_tensors[i], ir_input_values[i])) { matched = false; @@ -403,9 +421,8 @@ bool DrrRewritePattern::MatchFromOutputToInput( step, source_pattern_graph.CountOfOpCalls(), phi::errors::PreconditionNotMet( - "Pattern matching failed." - "The number of successful matches and the number of OpCalls in the " - "source pattern graph are not equal.")); + "Pattern matching failed. The number of successful matches and the " + "number of OpCalls in the source pattern graph are not equal.")); } else { return matched; } @@ -453,25 +470,25 @@ MatchContextImpl DrrRewritePattern::CreateOperations( PADDLE_ENFORCE_NE( result_pattern_graph.id2owned_tensor().count(in_tensor), 0, - phi::errors::NotFound("Not found the input tensor." - "Drr input tensor [%s] must exist in the result " - "pattern graph to be obtained.", - in_tensor)); + phi::errors::NotFound( + "Not found the input tensor. Drr input tensor [%s] must exist in " + "the result pattern graph to be obtained.", + in_tensor)); if (!result_pattern_graph.id2owned_tensor().at(in_tensor)->is_none()) { res_match_ctx.BindIrValue(in_tensor, src_match_ctx.GetIrValue(in_tensor)); } } - std::vector> temp_program; - std::unordered_map op_2_temp_program_index; - for (auto& op : *rewriter.block()) { - op_2_temp_program_index[&op] = temp_program.size(); - temp_program.push_back({&op}); - } - // topo order visit result_pattern_graph GraphTopo graph_topo_visit(&result_pattern_graph); graph_topo_visit.WalkGraphNodesTopoOrder([&](const OpCall& op_call) { + std::vector> temp_program; + std::unordered_map op_2_temp_program_index; + for (auto& op : *rewriter.block()) { + op_2_temp_program_index[&op] = temp_program.size(); + temp_program.push_back({&op}); + } + // set insert point size_t max_input_op_index = 0UL; pir::Operation* max_index_op = nullptr; @@ -518,11 +535,13 @@ MatchContextImpl DrrRewritePattern::CreateOperations( pir::Operation* new_op = CreateOperation(op_call, src_match_ctx, rewriter, &res_match_ctx); - op_2_temp_program_index[new_op] = max_input_op_index + 1; - if (max_input_op_index + 1 >= temp_program.size()) { + + size_t new_max_input_op_index = max_input_op_index + 1; + op_2_temp_program_index[new_op] = new_max_input_op_index; + if (new_max_input_op_index >= temp_program.size()) { temp_program.push_back({}); } - temp_program[max_input_op_index + 1].push_back(new_op); + temp_program[new_max_input_op_index].push_back(new_op); }); return res_match_ctx; diff --git a/paddle/fluid/pir/serialize_deserialize/CMakeLists.txt b/paddle/fluid/pir/serialize_deserialize/CMakeLists.txt new file mode 100644 index 0000000000000..4ab79bd350dc0 --- /dev/null +++ b/paddle/fluid/pir/serialize_deserialize/CMakeLists.txt @@ -0,0 +1,9 @@ +file(GLOB_RECURSE SERIALIZE_DESERIALIZE_CPP_SOURCES "*.cc") + +include_directories(pir_save_load PRIVATE + ${PADDLE_SOURCE_DIR}/third_party/nlohmann_json/include/) + +cc_library( + pir_save_load + SRCS ${SERIALIZE_DESERIALIZE_CPP_SOURCES} + DEPS op_dialect phi json) diff --git
a/paddle/fluid/pir/serialize_deserialize/include/deserialize_utils.h b/paddle/fluid/pir/serialize_deserialize/include/deserialize_utils.h new file mode 100644 index 0000000000000..d4aaefe81c983 --- /dev/null +++ b/paddle/fluid/pir/serialize_deserialize/include/deserialize_utils.h @@ -0,0 +1,316 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once +#include +#include +#include +#include + +#include "glog/logging.h" +#include "paddle/common/layout.h" +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" +#include "paddle/fluid/pir/serialize_deserialize/include/schema.h" +#include "paddle/fluid/pir/serialize_deserialize/include/third_part.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/pir/include/core/builtin_attribute.h" +#include "paddle/pir/include/core/builtin_type.h" + +namespace pir { + +template +T deserializeTypeFromJson(Json* type_json, pir::IrContext* ctx) { + return T::get(ctx); +} + +template +T deserializeAttrFromJson(Json* attr_json, pir::IrContext* ctx) { + CPP_T data = attr_json->at(DATA).template get(); + return T::get(ctx, data); +} + +template <> +pir::Complex64Attribute deserializeAttrFromJson( + Json* attr_json, pir::IrContext* ctx) { + Json data_json = attr_json->at(DATA); + phi::dtype::complex data = + phi::dtype::complex(data_json.at(0).template get(), + data_json.at(1).template get()); + return pir::Complex64Attribute::get(ctx, data); +} + +template <> +pir::Complex128Attribute +deserializeAttrFromJson(Json* attr_json, + pir::IrContext* ctx) { + Json data_json = attr_json->at(DATA); + phi::dtype::complex data = + phi::dtype::complex(data_json.at(0).template get(), + data_json.at(1).template get()); + return pir::Complex128Attribute::get(ctx, data); +} + +template <> +paddle::dialect::IntArrayAttribute +deserializeAttrFromJson>(Json* attr_json, + pir::IrContext* ctx) { + std::vector data = attr_json->at(DATA).get>(); + phi::IntArray int_array = phi::IntArray(data); + return paddle::dialect::IntArrayAttribute::get(ctx, int_array); +} + +pir::Attribute deserializeAttrFromJson_scalarAttr(Json* attr_json, + pir::IrContext* ctx) { + Json content = attr_json->at(DATA); + phi::DataType dtype_ = + phi::StringToDataType(content.at(0).template get()); + phi::Scalar scalar; + + if (dtype_ == phi::DataType::FLOAT32) { + scalar = phi::Scalar(content.at(1).template get()); + } else if (dtype_ == phi::DataType::INT32) { + scalar = phi::Scalar(content.at(1).template get()); + } else if (dtype_ == phi::DataType::FLOAT64) { + scalar = phi::Scalar(content.at(1).template get()); + } else if (dtype_ == phi::DataType::INT8) { + scalar = phi::Scalar(content.at(1).template get()); + } else if (dtype_ == phi::DataType::FLOAT16 || + dtype_ == phi::DataType::UINT16 || + dtype_ == phi::DataType::BFLOAT16) { + scalar = phi::Scalar(content.at(1).template get()); + } else if (dtype_ == phi::DataType::INT16) { + scalar = 
phi::Scalar(content.at(1).template get()); + } else if (dtype_ == phi::DataType::INT64) { + scalar = phi::Scalar(content.at(1).template get()); + } else if (dtype_ == phi::DataType::UINT8) { + scalar = phi::Scalar(content.at(1).template get()); + } else if (dtype_ == phi::DataType::UINT32) { + scalar = phi::Scalar(content.at(1).template get()); + } else if (dtype_ == phi::DataType::UINT64) { + scalar = phi::Scalar(content.at(1).template get()); + } else if (dtype_ == phi::DataType::BOOL) { + scalar = phi::Scalar(content.at(1).template get()); + } else if (dtype_ == phi::DataType::COMPLEX64) { + float scalar_real = content.at(1).template get(); + float scalar_imag = content.at(2).template get(); + phi::dtype::complex data = + phi::dtype::complex(scalar_real, scalar_imag); + scalar = phi::Scalar(data); + } else if (dtype_ == phi::DataType::COMPLEX128) { + double scalar_real = content.at(1).template get(); + double scalar_imag = content.at(2).template get(); + phi::dtype::complex data = + phi::dtype::complex(scalar_real, scalar_imag); + scalar = phi::Scalar(data); + } else { + PADDLE_ENFORCE(false, + phi::errors::InvalidArgument( + "Invalid tensor data type `", dtype_, "`.")); + } + + return paddle::dialect::ScalarAttribute::get(ctx, scalar); +} + +template <> +paddle::dialect::DataTypeAttribute +deserializeAttrFromJson( + Json* attr_json, pir::IrContext* ctx) { + std::string data = attr_json->at(DATA).template get(); + phi::DataType data_type = phi::StringToDataType(data); + return paddle::dialect::DataTypeAttribute::get(ctx, data_type); +} + +template <> +paddle::dialect::PlaceAttribute +deserializeAttrFromJson( + Json* attr_json, pir::IrContext* ctx) { + Json data_json = attr_json->at(DATA); + int8_t type_id = data_json.at(0).template get(); + phi::AllocationType type = static_cast(type_id); + int8_t id = data_json.at(1).template get(); // int8_t + std::string dev_type = data_json.at(2).template get(); // string + phi::Place place = phi::Place(type, id, dev_type); + return paddle::dialect::PlaceAttribute::get(ctx, place); +} + +pir::Type parseType(Json* type_json) { + auto type_name = type_json->at(ID).template get(); + pir::IrContext* ctx = pir::IrContext::Instance(); + + if (type_name == pir::BoolType::name()) { + VLOG(8) << "Parse BoolType ... "; + return pir::deserializeTypeFromJson(type_json, ctx); + } else if (type_name == pir::BFloat16Type::name()) { + VLOG(8) << "Parse BFloat16Type ... "; + return pir::deserializeTypeFromJson(type_json, ctx); + } else if (type_name == pir::Float16Type::name()) { + VLOG(8) << "Parse Float16Type ... "; + return pir::deserializeTypeFromJson(type_json, ctx); + } else if (type_name == pir::Float32Type::name()) { + VLOG(8) << "Parse Float32Type ... "; + return pir::deserializeTypeFromJson(type_json, ctx); + } else if (type_name == pir::Float64Type::name()) { + VLOG(8) << "Parse Float64Type ... "; + return pir::deserializeTypeFromJson(type_json, ctx); + } else if (type_name == pir::Int8Type::name()) { + VLOG(8) << "Parse Int8Type ... "; + return pir::deserializeTypeFromJson(type_json, ctx); + } else if (type_name == pir::UInt8Type::name()) { + VLOG(8) << "Parse UInt8Type ... "; + return pir::deserializeTypeFromJson(type_json, ctx); + } else if (type_name == pir::Int16Type::name()) { + VLOG(8) << "Parse Int16Type ... "; + return pir::deserializeTypeFromJson(type_json, ctx); + } else if (type_name == pir::Int32Type::name()) { + VLOG(8) << "Parse Int32Type ...
"; + return pir::deserializeTypeFromJson(type_json, ctx); + } else if (type_name == pir::Int64Type::name()) { + VLOG(8) << "Parse Int64Type ... "; + return pir::deserializeTypeFromJson(type_json, ctx); + } else if (type_name == pir::IndexType::name()) { + VLOG(8) << "Parse IndexType ... "; + return pir::deserializeTypeFromJson(type_json, ctx); + } else if (type_name == pir::Complex64Type::name()) { + VLOG(8) << "Parse Complex64Type ... "; + return pir::deserializeTypeFromJson(type_json, ctx); + } else if (type_name == pir::Complex128Type::name()) { + VLOG(8) << "Parse Complex128Type ... "; + return pir::deserializeTypeFromJson(type_json, ctx); + } else if (type_name == pir::VectorType::name()) { + VLOG(8) << "Parse VectorType ... "; + std::vector content; + for (auto& type_x : type_json->at(DATA)) { + content.push_back(parseType(&type_x)); + } + return pir::VectorType::get(ctx, content); + } else if (type_name == pir::DenseTensorType::name()) { + VLOG(8) << "Parse DenseTensorType ... "; + Json data_json = type_json->at(DATA); + pir::Type dtype = parseType(&(data_json.at(0))); + + std::vector dims = + data_json.at(1).template get>(); + phi::DDim ddim = phi::make_ddim(dims); + pir::DataLayout data_layout = + common::StringToDataLayout(data_json.at(2).template get()); + + std::vector> lod = + data_json.at(3).template get>>(); + + size_t offset = data_json.at(4).get(); + return pir::DenseTensorType::get( + ctx, dtype, ddim, data_layout, lod, offset); + } else if (type_name == NULL_TYPE) { + return pir::Type(); + } else { + PADDLE_ENFORCE(false, + phi::errors::InvalidArgument( + "Unknown Type %s for parse type", type_name)); + } + VLOG(8) << "Finish Parse Type ... "; + + return pir::Type(); +} + +template <> +pir::TypeAttribute deserializeAttrFromJson( + Json* attr_json, pir::IrContext* ctx) { + pir::Type type = parseType(&(attr_json->at(DATA))); + return pir::TypeAttribute::get(ctx, type); +} + +pir::Attribute parseAttr(Json* attr_json) { + std::string attr_name = attr_json->at(ID).template get(); + pir::IrContext* ctx = pir::IrContext::Instance(); + + if (attr_name == pir::BoolAttribute::name()) { + VLOG(8) << "Parse BoolAttribute ."; + return pir::deserializeAttrFromJson(attr_json, + ctx); + } else if (attr_name == pir::FloatAttribute::name()) { + VLOG(8) << "Parse FloatAttribute ."; + return pir::deserializeAttrFromJson(attr_json, + ctx); + } else if (attr_name == pir::DoubleAttribute::name()) { + VLOG(8) << "Parse DoubleAttribute ."; + return pir::deserializeAttrFromJson(attr_json, + ctx); + } else if (attr_name == pir::Int32Attribute::name()) { + VLOG(8) << "Parse Int32Attribute ."; + return pir::deserializeAttrFromJson(attr_json, + ctx); + } else if (attr_name == pir::Int64Attribute::name()) { + VLOG(8) << "Parse Int64Attribute ."; + return pir::deserializeAttrFromJson(attr_json, + ctx); + } else if (attr_name == pir::IndexAttribute::name()) { + VLOG(8) << "Parse IndexAttribute ."; + return pir::deserializeAttrFromJson(attr_json, + ctx); + } else if (attr_name == pir::ArrayAttribute::name()) { + VLOG(8) << "Parse ArrayAttribute ."; + std::vector val; + for (auto& attr_ : attr_json->at(DATA)) { + val.push_back(parseAttr(&(attr_))); + } + return pir::ArrayAttribute::get(ctx, val); + } else if (attr_name == pir::TypeAttribute::name()) { + VLOG(8) << "Parse TypeAttribute ."; + return pir::deserializeAttrFromJson( + attr_json, ctx); + } else if (attr_name == pir::TensorNameAttribute::name()) { + VLOG(8) << "Parse TensorNameAttribute ."; + return pir::deserializeAttrFromJson( + attr_json, 
ctx); + } else if (attr_name == pir::Complex64Attribute::name()) { + VLOG(8) << "Parse Complex64Attribute ."; + return pir::deserializeAttrFromJson( + attr_json, ctx); + } else if (attr_name == pir::Complex128Attribute::name()) { + VLOG(8) << "Parse Complex128Attribute ."; + return pir::deserializeAttrFromJson( + attr_json, ctx); + } else if (attr_name == pir::StrAttribute::name()) { + VLOG(8) << "Parse StrAttribute ."; + return pir::deserializeAttrFromJson( + attr_json, ctx); + } else if (attr_name == paddle::dialect::IntArrayAttribute::name()) { + VLOG(8) << "Parse IntArrayAttribute ."; + return pir::deserializeAttrFromJson>(attr_json, ctx); + } else if (attr_name == paddle::dialect::ScalarAttribute::name()) { + VLOG(8) << "Parse ScalarAttribute ."; + // this func's return type is pir::Attribute which is different + // from paddle::dialect::ScalarAttribute + return pir::deserializeAttrFromJson_scalarAttr(attr_json, ctx); + } else if (attr_name == paddle::dialect::DataTypeAttribute::name()) { + VLOG(8) << "Parse DataTypeAttribute ."; + return pir::deserializeAttrFromJson(attr_json, ctx); + } else if (attr_name == paddle::dialect::PlaceAttribute::name()) { + VLOG(8) << "Parse PlaceAttribute ."; + return pir::deserializeAttrFromJson(attr_json, ctx); + } else { + PADDLE_ENFORCE(false, + phi::errors::InvalidArgument( + "Unknown Attr %s for parse attr", attr_name)); + } + VLOG(8) << "Finish Parse Attr ... "; + + return pir::Attribute(); +} + +} // namespace pir diff --git a/paddle/fluid/pir/serialize_deserialize/include/interface.h b/paddle/fluid/pir/serialize_deserialize/include/interface.h new file mode 100644 index 0000000000000..3302dc1b90bb7 --- /dev/null +++ b/paddle/fluid/pir/serialize_deserialize/include/interface.h @@ -0,0 +1,63 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/pir/include/core/program.h" +namespace pir { +/** + * @brief Write the given PIR program into a file at the specified file path. + * + * @param[in] program The PIR program to be written. + * @param[in] file_path The path to the file to be written. + * @param[in] pir_version The version number of PIR, used to identify or verify + * the written program version. + * @param[in] overwrite If the file already exists, this flag determines + * whether to overwrite the existing file. + * @param[in] readable (Optional parameter, defaults to false) If true, the + * generated file is written with an indented structure. + * @param[in] trainable (Optional parameter, defaults to true) If true, each + * operation keeps the opresult attrs needed for training, such as + * stop_gradient and persistable; otherwise, it may only have opinfo attrs. + * + * @return Void. + * + * @note The readable and trainable parameters may affect the content and + * format of the generated file, depending on the implementation.
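 *
 * A minimal usage sketch; the file path and version number below are
 * illustrative values, not ones prescribed by this API:
 *
 *   pir::Program program(pir::IrContext::Instance());
 *   // ... build the program ...
 *   pir::WriteModule(program, "/tmp/demo.json", /*pir_version=*/1,
 *                    /*overwrite=*/true);
 *   pir::Program recovered(pir::IrContext::Instance());
 *   pir::ReadModule("/tmp/demo.json", &recovered, /*pir_version=*/1);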
+ */ +void WriteModule(const pir::Program& program, + const std::string& file_path, + const uint64_t& pir_version, + bool overwrite, + bool readable = false, + bool trainable = true); + +/** + * @brief Gets a PIR program from the specified file path. + * + * @param[in] file_path The path to the file from which the PIR program + * should be read. + * @param[out] program A pointer to the PIR program object where the + * deserialized program will be stored. + * @param[in] pir_version The current version of the PIR program format. + * + * @return Void. The function modifies the 'program' object to contain the data + * read from the file. + * + * @note If 'pir_version' is larger than the version of the file, the + * version compatibility modification rules will be triggered. + */ +void ReadModule(const std::string& file_path, + pir::Program* program, + const uint64_t& pir_version); +} // namespace pir diff --git a/paddle/fluid/pir/serialize_deserialize/include/ir_deserialize.h b/paddle/fluid/pir/serialize_deserialize/include/ir_deserialize.h new file mode 100644 index 0000000000000..2ae9f22d21a9c --- /dev/null +++ b/paddle/fluid/pir/serialize_deserialize/include/ir_deserialize.h @@ -0,0 +1,52 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include "paddle/common/enforce.h" +#include "paddle/fluid/pir/serialize_deserialize/include/third_part.h" +#include "paddle/pir/include/core/operation.h" +#include "paddle/pir/include/core/program.h" + +namespace pir { + +class ProgramReader { + public: + explicit ProgramReader(const uint64_t version) : current_version(version) {} + + ProgramReader(ProgramReader&&) = delete; + ProgramReader(const ProgramReader& ProgramReader) = delete; + ProgramReader& operator=(const ProgramReader&) = delete; + ProgramReader& operator=(ProgramReader&&); + + // static void staticInit() + + void RecoverProgram(Json* program_json, pir::Program* recover_program); + ~ProgramReader() = default; + + private: + uint64_t current_version; + std::map id_value_map; + + void ReadProgram(Json* program_json, pir::Program* program); + void ReadRegion(Json* region_json, pir::Region* region); + void ReadBlock(Json* block_json, pir::Block* block); + pir::Operation* ReadOp(Json* op_json); + pir::AttributeMap ReadAttributesMap(Json* attrs_json, + Json* opresult_attrs_json); + pir::Attribute ReadAttribute(Json* attr_json); + pir::Type ReadType(Json* type_json); +}; + +} // namespace pir diff --git a/paddle/fluid/pir/serialize_deserialize/include/ir_serialize.h b/paddle/fluid/pir/serialize_deserialize/include/ir_serialize.h new file mode 100644 index 0000000000000..de8c7be16c5d6 --- /dev/null +++ b/paddle/fluid/pir/serialize_deserialize/include/ir_serialize.h @@ -0,0 +1,81 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once +#include "paddle/fluid/pir/serialize_deserialize/include/third_part.h" +#include "paddle/pir/include/core/program.h" + +namespace pir { +/** + * ProgramWriter is used to serialize a pir program to a json object. + * + */ + +class ProgramWriter { + public: + explicit ProgramWriter(const uint64_t version) : version_(version) {} + explicit ProgramWriter(const uint64_t version, const bool trainable) + : version_(version), trainable_(trainable) {} + + ProgramWriter(ProgramWriter&&) = delete; + ProgramWriter(const ProgramWriter& ProgramWriter) = delete; + ProgramWriter& operator=(const ProgramWriter&) = delete; + ProgramWriter& operator=(ProgramWriter&&); + + /** GetProgramJson is used by the WriteModule api. */ + Json GetProgramJson(const pir::Program* program); + ~ProgramWriter() = default; + + private: + /** version_ is the version of PaddlePaddle, which is used to + * conduct version compatibility judgment and modification. */ + uint64_t version_; + + /** program_json is the json object of the pir program. */ + Json program_json; + + /** value_id_map is used to record the serialized id of each pir::Value, + * which is used to serialize an op's operands. */ + std::map value_id_map; + + /** The xxx_id_ counters record the current id of each IR structure + * to be serialized. */ + + int64_t region_id_ = 0; + int64_t block_id_ = 0; + int64_t value_id_ = 1; + int64_t blockarg_id_ = -1; + + bool trainable_ = true; + + Json WriteProgram(const pir::Program* program); + Json WriteRegion(const pir::Region* region, const std::string& region_name); + Json WriteBlock(const pir::Block* block, const std::string& block_name); + Json WriteOp(const pir::Operation& op); + Json WriteBlockArg(const pir::Value& value); + Json WriteValue(const pir::Value& value); + Json WriteOpOperand(const pir::OpOperand& op_operand); + Json WriteAttributesMapOpinfo(pir::Operation* op, + const AttributeMap& attr_map); + Json WriteAttributesMapOther(const AttributeMap& attr_map); + /** WriteAttribute is used to write an attribute of an op. + * It calls writeAttr to get the derived class's json object; + * WriteType works the same way. + */ + + Json WriteAttribute(const std::string& op_attr_name, + const pir::Attribute& attr); + Json WriteType(const pir::Type& type); +}; + +} // namespace pir diff --git a/paddle/fluid/pir/serialize_deserialize/include/save_load_parameters.h b/paddle/fluid/pir/serialize_deserialize/include/save_load_parameters.h new file mode 100644 index 0000000000000..5ebbafb1eb4f7 --- /dev/null +++ b/paddle/fluid/pir/serialize_deserialize/include/save_load_parameters.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include + +#include "paddle/phi/core/dense_tensor.h" + +namespace pir { + +void SaveFunction(const phi::DenseTensor& x, + const std::string& name, + const std::string& file_path, + bool overwrite, + bool save_as_fp16); + +void SaveCombineFunction(const std::vector& x, + const std::vector& names, + const std::string& file_path, + bool overwrite, + bool save_as_fp16, + bool save_to_memory); + +void LoadFunction(const std::string& file_path, + int64_t seek, + const std::vector& shape, + bool load_as_fp16, + phi::DenseTensor* out); + +void LoadCombineFunction(const std::string& file_path, + const std::vector& names, + std::vector* out, + bool load_as_fp16); +} // namespace pir diff --git a/paddle/fluid/pir/serialize_deserialize/include/schema.h b/paddle/fluid/pir/serialize_deserialize/include/schema.h new file mode 100644 index 0000000000000..d444bee469596 --- /dev/null +++ b/paddle/fluid/pir/serialize_deserialize/include/schema.h @@ -0,0 +1,67 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +namespace pir { +/** + * IMPORTANT!!! + * None of these defining strings can be changed, otherwise deserialization + * will fail. Define all keys of the serialized files here to ensure accuracy + * for deserialization, and make sure all the keys are mutually exclusive. + */ + +// The identifier of every IR structure (region, block, op, attr, type, +// value, etc.), which can be a string, an int64_t, etc. +#define ID "id" + +// program's key: +#define REGIONS "regions" + +// region's key: +// which is json array with block json object(ID and BLOCKARGS and BLOCKOPS) +#define BLOCKS "blocks" + +// block's key: +// which is json array with value json object +#define BLOCKARGS "args" +// which is json array with operation json object +#define BLOCKOPS "ops" + +// operation's key: +// which is json array with opoperand json object(ID) +#define OPOPERANDS "I" + +// which is json array with value json object(ID and TYPE_TYPE) +#define OPRESULTS "O" + +// which is json array with json object(NAME and ATTR_TYPE) +#define ATTRS "A" +#define OPRESULTS_ATTRS "OA" + +// value's key: +// value's type which should be pir::Type's json object(ID or ID and DATA). +#define TYPE_TYPE "TT" + +// attr's name which is operation's feature. +#define NAME "N" + +// attr's value which is pir::Attribute's json object(ID and DATA). +#define ATTR_TYPE "AT" + +// type/attr's contents which is json::array. +#define DATA "D" + +// NULL_TYPE +#define NULL_TYPE "NULL" +} // namespace pir diff --git a/paddle/fluid/pir/serialize_deserialize/include/serialize_utils.h b/paddle/fluid/pir/serialize_deserialize/include/serialize_utils.h new file mode 100644 index 0000000000000..a6cae97f135d9 --- /dev/null +++ b/paddle/fluid/pir/serialize_deserialize/include/serialize_utils.h @@ -0,0 +1,346 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include + +#include "glog/logging.h" + +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" +#include "paddle/fluid/pir/serialize_deserialize/include/schema.h" +#include "paddle/fluid/pir/serialize_deserialize/include/third_part.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/pir/include/core/builtin_attribute.h" +#include "paddle/pir/include/core/builtin_type.h" + +namespace pir { +/** serializeTypeToJson is a template function to serialize + * a pir type to a json object. A pir type may or may not carry a value: + * value-free types only have ID, while value-based types have + * DATA in addition to ID. + * + * If a new pir type that needs to be serialized is added, + * it must have a name() method, returning a string which + * should be different from other types' names. + * (The name template is t_dialectname_typename). + * Note: the prefix t stands for 'type'. + * + * If the pir type has a value, it should have a data() method, + * which returns the value of the type. The data() method should + * preferably return a type supported by json, such as std::vector, + * std::string, int, float and so on; if not, serializeTypeToJson + * needs to be specialized. + */ + +template +Json serializeTypeToJson(const T& type) { + Json json_obj; + json_obj[ID] = type.name(); + return json_obj; +} + +/** serializeAttrToJson is a template function to serialize + * a pir attribute to a json object. A pir attribute usually carries a + * value, so its json object has DATA in addition to ID. + * + * If a new pir attr that needs to be serialized is added, + * it must have a name() method, returning a string which + * should be different from other attrs' names. + * (The name template is a_dialectname_typename). + * Note: the prefix a stands for 'attribute'. + * + * It also needs a data() method, which returns the value of the + * attribute. The data() method should preferably return a type + * supported by json, such as std::vector, std::string, int, + * float and so on; if not, serializeAttrToJson + * needs to be specialized.
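 *
 * As an illustration, a hand-written specialization keeps the same
 * ID/DATA shape (MyAttr and ToVector() are hypothetical names, not part
 * of this patch):
 *
 *   template <>
 *   Json serializeAttrToJson<MyAttr>(const MyAttr& attr) {
 *     Json json_obj;
 *     json_obj[ID] = attr.name();              // e.g. "a_mydialect_myattr"
 *     json_obj[DATA] = attr.data().ToVector(); // a json-friendly value
 *     return json_obj;
 *   }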
+ */ + +template +Json serializeAttrToJson(const T& attr) { + Json json_obj; + json_obj[ID] = attr.name(); + json_obj[DATA] = attr.data(); + return json_obj; +} + +#define SERIALIZE_ATTR_TO_JSON(type, data) \ + template <> \ + Json serializeAttrToJson(const type& attr) { \ + Json json_obj; \ + json_obj[ID] = attr.name(); \ + json_obj[DATA] = data; \ + return json_obj; \ + } + +SERIALIZE_ATTR_TO_JSON(pir::StrAttribute, attr.AsString()); + +SERIALIZE_ATTR_TO_JSON(pir::Complex64Attribute, + std::vector({attr.data().real, attr.data().imag})); +SERIALIZE_ATTR_TO_JSON(pir::Complex128Attribute, + std::vector({attr.data().real, attr.data().imag})); +SERIALIZE_ATTR_TO_JSON(paddle::dialect::IntArrayAttribute, + attr.data().GetData()); +SERIALIZE_ATTR_TO_JSON(paddle::dialect::DataTypeAttribute, + phi::DataTypeToString(attr.data())); + +template <> +Json serializeAttrToJson( + const paddle::dialect::ScalarAttribute& attr) { + Json json_obj; + json_obj[ID] = attr.name(); + + Json content = Json::array(); + auto scalar = attr.data(); + auto dtype_ = scalar.dtype(); + content.push_back(DataTypeToString(dtype_)); + + if (dtype_ == phi::DataType::FLOAT32) { + content.push_back(scalar.to()); + } else if (dtype_ == phi::DataType::INT32) { + content.push_back(scalar.to()); + } else if (dtype_ == phi::DataType::FLOAT64) { + content.push_back(scalar.to()); + } else if (dtype_ == phi::DataType::INT8) { + content.push_back(scalar.to()); + } else if (dtype_ == phi::DataType::FLOAT16 || + dtype_ == phi::DataType::UINT16 || + dtype_ == phi::DataType::BFLOAT16) { + content.push_back(scalar.to()); + } else if (dtype_ == phi::DataType::INT16) { + content.push_back(scalar.to()); + } else if (dtype_ == phi::DataType::INT64) { + content.push_back(scalar.to()); + } else if (dtype_ == phi::DataType::UINT8) { + content.push_back(scalar.to()); + } else if (dtype_ == phi::DataType::UINT32) { + content.push_back(scalar.to()); + } else if (dtype_ == phi::DataType::UINT64) { + content.push_back(scalar.to()); + } else if (dtype_ == phi::DataType::BOOL) { + content.push_back(scalar.to()); + } else if (dtype_ == phi::DataType::COMPLEX64) { + content.push_back(scalar.to>().real); + content.push_back(scalar.to>().imag); + } else if (dtype_ == phi::DataType::COMPLEX128) { + content.push_back(scalar.to>().real); + content.push_back(scalar.to>().imag); + } else { + PADDLE_THROW(common::errors::InvalidArgument( + "Invalid tensor data type `", dtype_, "`.")); + } + json_obj[DATA] = content; + return json_obj; +} + +template <> +Json serializeAttrToJson( + const paddle::dialect::PlaceAttribute& attr) { + Json json_obj; + json_obj[ID] = attr.name(); + Json content = Json::array(); + auto place = attr.data(); + content.push_back(static_cast(place.GetType())); + content.push_back(place.GetDeviceId()); // int8_t + content.push_back(place.GetDeviceType()); // string + json_obj[DATA] = content; + return json_obj; +} + +Json writeType(const pir::Type& type) { + Json type_json = Json::object(); + if (type.isa()) { + VLOG(8) << "Write BoolType ... "; + return pir::serializeTypeToJson( + type.dyn_cast()); + } else if (type.isa()) { + VLOG(8) << "Write BFloat16Type ... "; + return pir::serializeTypeToJson( + type.dyn_cast()); + } else if (type.isa()) { + VLOG(8) << "Write Float16Type ... "; + return pir::serializeTypeToJson( + type.dyn_cast()); + } else if (type.isa()) { + VLOG(8) << "Write Float32Type ... "; + return pir::serializeTypeToJson( + type.dyn_cast()); + } else if (type.isa()) { + VLOG(8) << "Write Float64Type ... 
"; + return pir::serializeTypeToJson( + type.dyn_cast()); + } else if (type.isa()) { + VLOG(8) << "Write Int8Type ... "; + return pir::serializeTypeToJson( + type.dyn_cast()); + } else if (type.isa()) { + VLOG(8) << "Write UInt8Type ... "; + return pir::serializeTypeToJson( + type.dyn_cast()); + } else if (type.isa()) { + VLOG(8) << "Write Int16Type ... "; + return pir::serializeTypeToJson( + type.dyn_cast()); + } else if (type.isa()) { + VLOG(8) << "Write Int32Type ... "; + return pir::serializeTypeToJson( + type.dyn_cast()); + } else if (type.isa()) { + VLOG(8) << "Write Int64Type ... "; + return pir::serializeTypeToJson( + type.dyn_cast()); + } else if (type.isa()) { + VLOG(8) << "Write IndexType ... "; + return pir::serializeTypeToJson( + type.dyn_cast()); + } else if (type.isa()) { + VLOG(8) << "Write Complex64Type ... "; + return pir::serializeTypeToJson( + type.dyn_cast()); + } else if (type.isa()) { + VLOG(8) << "Write Complex128Type ... "; + return pir::serializeTypeToJson( + type.dyn_cast()); + // NOTE(Ruting) those Types need call writeType which make build error + // when use template func serializeTypeToJson + } else if (type.isa()) { + VLOG(8) << "Write VectorType ... "; + auto type_ = type.dyn_cast(); + type_json[ID] = type_.name(); + Json content = Json::array(); + for (auto type_x : type_.data()) { + content.push_back(writeType(type_x)); + } + type_json[DATA] = content; + return type_json; + } else if (type.isa()) { + VLOG(8) << "Write DenseTensorType ... "; + auto type_ = type.dyn_cast(); + + type_json[ID] = type_.name(); + Json content = Json::array(); + content.push_back(writeType(type_.dtype())); + + std::vector dims_; + for (auto i = 0; i < type_.dims().size(); i++) { + dims_.push_back(type_.dims().at(i)); + } + content.push_back(dims_); + + content.push_back(DataLayoutToString(type_.data_layout())); + + content.push_back(type_.lod()); + + content.push_back(type_.offset()); + type_json[DATA] = content; + return type_json; + } else if (!type) { + type_json[ID] = NULL_TYPE; + return type_json; + } else { + PADDLE_ENFORCE( + false, phi::errors::InvalidArgument("Unknown Type when write type")); + } + VLOG(8) << "Finish write Type ... 
"; + + return type_json; +} + +SERIALIZE_ATTR_TO_JSON(pir::TypeAttribute, writeType(attr.data())); + +Json writeAttr(const pir::Attribute& attr) { + Json attr_json = Json::object(); + if (attr.isa()) { + VLOG(8) << "write BoolAttribute ."; + return pir::serializeAttrToJson( + attr.dyn_cast()); + } else if (attr.isa()) { + VLOG(8) << "write FloatAttribute ."; + return pir::serializeAttrToJson( + attr.dyn_cast()); + } else if (attr.isa()) { + VLOG(8) << "write DoubleAttribute ."; + return pir::serializeAttrToJson( + attr.dyn_cast()); + } else if (attr.isa()) { + VLOG(8) << "write Int32Attribute ."; + return pir::serializeAttrToJson( + attr.dyn_cast()); + } else if (attr.isa()) { + VLOG(8) << "write Int64Attribute ."; + return pir::serializeAttrToJson( + attr.dyn_cast()); + } else if (attr.isa()) { + VLOG(8) << "write IndexAttribute ."; + return pir::serializeAttrToJson( + attr.dyn_cast()); + } else if (attr.isa()) { + VLOG(8) << "write ArrayAttribute ."; + auto attr_ = attr.dyn_cast(); + Json val = Json::array(); + for (size_t i = 0; i < attr_.size(); i++) { + val.push_back(writeAttr(attr_.at(i))); + } + attr_json[ID] = attr_.name(); + attr_json[DATA] = val; + return attr_json; + } else if (attr.isa()) { + VLOG(8) << "write TypeAttribute ."; + return pir::serializeAttrToJson( + attr.dyn_cast()); + } else if (attr.isa()) { + VLOG(8) << "write TensorNameAttribute ."; + return pir::serializeAttrToJson( + attr.dyn_cast()); + } else if (attr.isa()) { + VLOG(8) << "write Complex64Attribute ."; + return pir::serializeAttrToJson( + attr.dyn_cast()); + } else if (attr.isa()) { + VLOG(8) << "write Complex128Attribute ."; + return pir::serializeAttrToJson( + attr.dyn_cast()); + } else if (attr.isa()) { + VLOG(8) << "write StrAttribute ."; + return pir::serializeAttrToJson( + attr.dyn_cast()); + } else if (attr.isa()) { + VLOG(8) << "write IntArrayAttribute ."; + return pir::serializeAttrToJson( + attr.dyn_cast()); + } else if (attr.isa()) { + VLOG(8) << "write ScalarAttribute ."; + return pir::serializeAttrToJson( + attr.dyn_cast()); + } else if (attr.isa()) { + VLOG(8) << "write DataTypeAttribute ."; + return pir::serializeAttrToJson( + attr.dyn_cast()); + } else if (attr.isa()) { + VLOG(8) << "write PlaceAttribute ."; + return pir::serializeAttrToJson( + attr.dyn_cast()); + } else { + PADDLE_ENFORCE( + false, phi::errors::InvalidArgument("Unknown Attr %s when write attr")); + } + VLOG(8) << "Finish write& attr ... "; + + return attr_json; +} + +} // namespace pir diff --git a/paddle/fluid/pir/serialize_deserialize/include/third_part.h b/paddle/fluid/pir/serialize_deserialize/include/third_part.h new file mode 100644 index 0000000000000..bfa5146336902 --- /dev/null +++ b/paddle/fluid/pir/serialize_deserialize/include/third_part.h @@ -0,0 +1,17 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include "nlohmann/json.hpp" +using Json = nlohmann::json; diff --git a/paddle/fluid/pir/serialize_deserialize/src/interface.cc b/paddle/fluid/pir/serialize_deserialize/src/interface.cc new file mode 100644 index 0000000000000..7a55c478c8b1b --- /dev/null +++ b/paddle/fluid/pir/serialize_deserialize/src/interface.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/pir/serialize_deserialize/include/interface.h" +#include "paddle/common/enforce.h" +#include "paddle/fluid/pir/serialize_deserialize/include/ir_deserialize.h" +#include "paddle/fluid/pir/serialize_deserialize/include/ir_serialize.h" +#include "paddle/phi/common/port.h" + +namespace pir { +#define PROGRAM "program" +#define BASE_CODE "base_code" +#define MAGIC "magic" +#define PIRVERSION "version" +#define PIR "pir" +void WriteModule(const pir::Program& program, + const std::string& file_path, + const uint64_t& pir_version, + bool overwrite, + bool readable, + bool trainable) { + PADDLE_ENFORCE_EQ( + FileExists(file_path) && !overwrite, + false, + common::errors::PreconditionNotMet( + "%s exists!, cannot save to it when overwrite is set to false.", + file_path, + overwrite)); + + // write base code + Json total; + + total[BASE_CODE] = {{MAGIC, PIR}, {PIRVERSION, pir_version}}; + + ProgramWriter writer(pir_version, trainable); + // write program + total[PROGRAM] = writer.GetProgramJson(&program); + std::string total_str; + if (readable) { + total_str = total.dump(4); + } else { + total_str = total.dump(); + } + + MkDirRecursively(DirName(file_path).c_str()); + std::ofstream fout(file_path, std::ios::binary); + PADDLE_ENFORCE_EQ(static_cast(fout), + true, + common::errors::Unavailable( + "Cannot open %s to save variables.", file_path)); + fout << total_str; + fout.close(); +} + +void ReadModule(const std::string& file_path, + pir::Program* program, + const uint64_t& pir_version) { + std::ifstream f(file_path); + Json data = Json::parse(f); + + if (data.contains(BASE_CODE) && data[BASE_CODE].contains(MAGIC) && + data[BASE_CODE][MAGIC] == PIR) { + uint64_t file_version = + data.at(BASE_CODE).at(PIRVERSION).template get(); + if (file_version != pir_version) { + PADDLE_THROW( + common::errors::InvalidArgument("Invalid model version file.")); + } + } else { + PADDLE_THROW(common::errors::InvalidArgument("Invalid model file.")); + } + + ProgramReader reader(pir_version); + reader.RecoverProgram(&(data[PROGRAM]), program); +} + +} // namespace pir diff --git a/paddle/fluid/pir/serialize_deserialize/src/ir_deserialize.cc b/paddle/fluid/pir/serialize_deserialize/src/ir_deserialize.cc new file mode 100644 index 0000000000000..88ee2ba168476 --- /dev/null +++ b/paddle/fluid/pir/serialize_deserialize/src/ir_deserialize.cc @@ -0,0 +1,160 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/pir/serialize_deserialize/include/ir_deserialize.h" +#include "paddle/fluid/pir/serialize_deserialize/include/deserialize_utils.h" namespace pir { +void ProgramReader::RecoverProgram(Json* program_json, + pir::Program* recover_program) { + id_value_map[0] = pir::Value(); + ReadProgram(program_json, recover_program); + VLOG(6) << "Finish json to program."; + return; +} +void ProgramReader::ReadProgram(Json* program_json, pir::Program* program) { + auto top_level_op = program->module_op(); + PADDLE_ENFORCE_EQ( + program_json->at(REGIONS).size(), + 1, + common::errors::InvalidArgument( + "The regions size of program module should be 1 but got %d.", + program_json->at(REGIONS).size())); + auto& region_json = program_json->at(REGIONS).at(0); + auto& block_json = region_json.at(BLOCKS).at(0); + auto& block = top_level_op.block(); + ReadBlock(&block_json, &block); + + VLOG(6) << "Finish Read program."; + return; +} + +void ProgramReader::ReadRegion(Json* region_json, pir::Region* region) { + auto region_name = region_json->at(ID).template get(); + for (auto& block_json : region_json->at(BLOCKS)) { + auto& block = region->emplace_back(); + ReadBlock(&block_json, &block); + } + VLOG(6) << "Finish Read " << region_name; + return; +} + +void ProgramReader::ReadBlock(Json* block_json, pir::Block* block) { + auto block_name = block_json->at(ID).template get(); + + Json& args_json = block_json->at(BLOCKARGS); + if (!args_json.empty()) { + for (auto& arg_json : args_json) { + int64_t arg_id_ = arg_json.at(ID).template get(); + auto value = block->AddArg(ReadType(&(arg_json.at(TYPE_TYPE)))); + id_value_map[arg_id_] = value; + VLOG(6) << "Finish Read blockargument " << arg_id_; + } + } + + Json& ops_json = block_json->at(BLOCKOPS); + if (!ops_json.empty()) { + for (auto& op_json : ops_json) { + block->push_back(ReadOp(&op_json)); + } + } + + VLOG(6) << "Finish Read " << block_name; + return; +} + +pir::Operation* ProgramReader::ReadOp(Json* op_json) { + auto op_name = op_json->at(ID).template get(); + + // deserialize opoperands (find value) + Json& operands_json = op_json->at(OPOPERANDS); + std::vector inputs; + for (auto& operand_json : operands_json) { + int64_t id = operand_json.at(ID).template get(); + inputs.push_back(id_value_map[id]); + } + + // deserialize opresults (find type) + Json& opresults_json = op_json->at(OPRESULTS); + std::vector output_types; + std::vector output_ids; + for (auto& opresult_json : opresults_json) { + int64_t value_id_ = opresult_json.at(ID).template get(); + output_ids.push_back(value_id_); + output_types.push_back(ReadType(&(opresult_json.at(TYPE_TYPE)))); + VLOG(6) << "Finish Read value " << value_id_; + } + + // deserialize necessary attributes + Json& attrs_json = op_json->at(ATTRS); + + pir::AttributeMap attributes; + if (op_json->contains(OPRESULTS_ATTRS)) { + Json& opresults_attrs_json = op_json->at(OPRESULTS_ATTRS); + attributes = ReadAttributesMap(&attrs_json,
+                                   &opresults_attrs_json);
+  } else {
+    Json empty_json = Json::array();
+    attributes = ReadAttributesMap(&attrs_json, &empty_json);
+  }
+
+  pir::IrContext* ctx_ = pir::IrContext::Instance();
+  // prepare opinfo
+  pir::OpInfo op_info = ctx_->GetRegisteredOpInfo(op_name);
+
+  // deserialize op
+  pir::Operation* op =
+      Operation::Create(inputs, attributes, output_types, op_info);
+
+  PADDLE_ENFORCE_EQ(
+      output_ids.size(),
+      static_cast<size_t>(op->num_results()),
+      common::errors::InvalidArgument(
+          "deserialized op has %d results, but the original op has %d results.",
+          op->num_results(),
+          output_ids.size()));
+
+  for (uint32_t i = 0; i < op->num_results(); i++) {
+    id_value_map[output_ids[i]] = op->result(i);
+  }
+
+  VLOG(6) << "Finish Read Operation " << op->name();
+  return op;
+}
+
+pir::AttributeMap ProgramReader::ReadAttributesMap(Json* attrs_json,
+                                                   Json* opresult_attrs_json) {
+  pir::AttributeMap attributes;
+  for (auto& attr_json : *attrs_json) {
+    auto attr_name = attr_json.at(NAME).template get<std::string>();
+    attributes.insert({attr_name, ReadAttribute(&attr_json)});
+  }
+  VLOG(6) << "Finish Read pir::AttributeMap ";
+  for (auto& attr_json : *opresult_attrs_json) {
+    auto attr_name = attr_json.at(NAME).template get<std::string>();
+    attributes.insert({attr_name, ReadAttribute(&attr_json)});
+  }
+  VLOG(6) << "Finish Read Opresults_AttributeMap ";
+  return attributes;
+}
+
+pir::Attribute ProgramReader::ReadAttribute(Json* attr_json) {
+  VLOG(6) << "Begin Read Attribute. ";
+  return pir::parseAttr(&attr_json->at(ATTR_TYPE));
+}
+
+pir::Type ProgramReader::ReadType(Json* type_json) {
+  VLOG(6) << "Begin Read Type. ";
+  return pir::parseType(type_json);
+}
+
+} // namespace pir
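Note: the read side and the write side have to agree on the per-op JSON layout. For reference, the shape ProgramReader::ReadOp expects can be mocked up with nlohmann::json as below; the lowercase key strings are stand-ins for the ID/OPOPERANDS/OPRESULTS/ATTRS/TYPE_TYPE macros whose real values live in the serialize_deserialize headers, and the ids are hypothetical:

    #include "nlohmann/json.hpp"

    nlohmann::json ExampleOpJson() {
      nlohmann::json op;
      op["id"] = "pd_op.relu";         // ID: op name registered in the IrContext
      op["operands"] = {{{"id", 1}}};  // OPOPERANDS: ids resolved via id_value_map
      op["results"] = {{{"id", 2}, {"type", nlohmann::json::object()}}};
      op["attrs"] = nlohmann::json::array();  // ATTRS: [{name, type}, ...]
      // OPRESULTS_ATTRS is only present when the module was saved as trainable.
      return op;
    }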
diff --git a/paddle/fluid/pir/serialize_deserialize/src/ir_serialize.cc b/paddle/fluid/pir/serialize_deserialize/src/ir_serialize.cc
new file mode 100644
index 0000000000000..21067aa83906d
--- /dev/null
+++ b/paddle/fluid/pir/serialize_deserialize/src/ir_serialize.cc
@@ -0,0 +1,216 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/pir/serialize_deserialize/include/ir_serialize.h"
+#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h"
+#include "paddle/fluid/pir/serialize_deserialize/include/serialize_utils.h"
+#include "paddle/pir/include/core/dialect.h"
+#include "paddle/pir/include/core/operation.h"
+
+namespace pir {
+
+Json ProgramWriter::GetProgramJson(const pir::Program* program) {
+  program_json = WriteProgram(program);
+  VLOG(6) << "Finish program to json.";
+  return program_json;
+}
+
+Json ProgramWriter::WriteProgram(const pir::Program* program) {
+  Json program_json;
+  program_json[REGIONS] = Json::array();
+  auto top_level_op = program->module_op();
+
+  for (size_t i = 0; i < top_level_op->num_regions(); ++i) {
+    std::string region_name = "region_" + std::to_string(region_id_++);
+    auto& region = top_level_op->region(i);
+    auto region_json = WriteRegion(&region, region_name);
+    program_json[REGIONS].emplace_back(region_json);
+  }
+  VLOG(6) << "Finish write program.";
+  return program_json;
+}
+
+Json ProgramWriter::WriteRegion(const pir::Region* region,
+                                const std::string& region_name) {
+  Json region_json;
+  region_json[ID] = region_name;
+  region_json[BLOCKS] = Json::array();
+  for (auto block : region->blocks()) {
+    std::string block_name = "block_" + std::to_string(block_id_++);
+    auto block_json = WriteBlock(block, block_name);
+    region_json[BLOCKS].emplace_back(block_json);
+  }
+  VLOG(6) << "Finish write " << region_name;
+  return region_json;
+}
+
+Json ProgramWriter::WriteBlock(const pir::Block* block,
+                               const std::string& block_name) {
+  Json block_json;
+  block_json[ID] = block_name;
+
+  Json args_json = Json::array();
+  for (auto arg : block->args()) {
+    auto arg_json = WriteBlockArg(arg);
+    args_json.emplace_back(arg_json);
+  }
+  block_json[BLOCKARGS] = args_json;
+
+  Json ops_json = Json::array();
+  for (auto op : block->ops()) {
+    auto op_json = WriteOp(*op);
+    ops_json.emplace_back(op_json);
+  }
+  block_json[BLOCKOPS] = ops_json;
+
+  VLOG(6) << "Finish write " << block_name;
+  return block_json;
+}
+
+Json ProgramWriter::WriteBlockArg(const pir::Value& value) {
+  Json arg_json;
+  Json var = WriteType(value.type());
+  value_id_map[value] = blockarg_id_;
+  arg_json[ID] = blockarg_id_;
+  arg_json[TYPE_TYPE] = var;
+
+  VLOG(6) << "Finish write blockargument " << blockarg_id_;
+  blockarg_id_--;
+
+  return arg_json;
+}
+
+Json ProgramWriter::WriteValue(const pir::Value& value) {
+  Json var_json;
+  if (value) {
+    value_id_map[value] = value_id_;
+    var_json[ID] = value_id_;
+    VLOG(6) << "Finish write value " << value_id_;
+    value_id_++;
+  } else {
+    var_json[ID] = 0;  // NULL_TYPE
+    VLOG(6) << "Finish write NULL_TYPE value.";
+  }
+
+  Json var = WriteType(value.type());
+  var_json[TYPE_TYPE] = var;
+
+  return var_json;
+}
+
+Json ProgramWriter::WriteOp(const pir::Operation& op) {
+  Json op_json = Json::object();
+  op_json[ID] = op.name();
+  // serialize opoperands
+  Json operands_json = Json::array();
+  for (auto operand : op.operands()) {
+    auto operand_json = WriteOpOperand(operand);
+    operands_json.emplace_back(operand_json);
+  }
+  op_json[OPOPERANDS] = operands_json;
+
+  // serialize opresults
+  Json opresults_json = Json::array();
+  for (auto& opresult : op.results()) {
+    auto opresult_json = WriteValue(opresult);
+    opresults_json.emplace_back(opresult_json);
+  }
+  op_json[OPRESULTS] = opresults_json;
+
+  // serialize attributes
+  op_json[ATTRS] = WriteAttributesMapOpinfo(const_cast<pir::Operation*>(&op),
+                                            op.attributes());
+  if (trainable_) {
+    op_json[OPRESULTS_ATTRS] =
+        WriteAttributesMapOther(op.attributes());
+  }
+
+  VLOG(6) << "Finish write Operation " << op.name();
+  return op_json;
+}
+
+Json ProgramWriter::WriteOpOperand(const pir::OpOperand& op_operand) {
+  Json operand_json = Json::object();
+  if (op_operand.source()) {
+    int64_t id = value_id_map[op_operand.source()];
+    operand_json[ID] = id;
+    VLOG(6) << "Finish write OpOperand " << id;
+  } else {
+    operand_json[ID] = 0;  // NULL_VALUE
+    VLOG(6) << "Finish write NULL_VALUE OpOperand.";
+  }
+
+  return operand_json;
+}
+
+Json ProgramWriter::WriteAttributesMapOpinfo(pir::Operation* op,
+                                             const AttributeMap& attr_map) {
+  Json attrs_json = Json::array();
+
+  if (op->dialect()->name() == "pd_op") {
+    if (op->dyn_cast<paddle::dialect::OpYamlInfoInterface>() != nullptr) {
+      auto [_1, attr_info, _3, _4, _5] =
+          op->dyn_cast<paddle::dialect::OpYamlInfoInterface>().GetOpInfo();
+      if (attr_info.size() != 0) {
+        for (auto it = attr_info.begin(); it != attr_info.end(); it++) {
+          if (attr_map.find(it->name) != attr_map.end()) {
+            attrs_json.emplace_back(
+                WriteAttribute(it->name, attr_map.at(it->name)));
+          }
+        }
+      }
+    } else {
+      PADDLE_THROW(common::errors::InvalidArgument(
+          "The op %s does not have OpYamlInfoInterface.", op->name()));
+    }
+  } else {
+    for (auto& attr : attr_map) {
+      if (attr.first != "stop_gradient" && attr.first != "persistable" &&
+          attr.first != "op_callstack") {
+        attrs_json.emplace_back(WriteAttribute(attr.first, attr.second));
+      }
+    }
+  }
+
+  VLOG(6) << "Finish write Opinfo AttributeMap ";
+  return attrs_json;
+}
+
+Json ProgramWriter::WriteAttributesMapOther(const AttributeMap& attr_map) {
+  Json operesult_attrs_json = Json::array();
+  for (auto& attr : attr_map) {
+    if (attr.first == "stop_gradient" || attr.first == "persistable") {
+      operesult_attrs_json.emplace_back(
+          WriteAttribute(attr.first, attr.second));
+    }
+  }
+
+  VLOG(6) << "Finish write Other AttributeMap ";
+  return operesult_attrs_json;
+}
+
+Json ProgramWriter::WriteAttribute(const std::string& op_attr_name,
+                                   const pir::Attribute& attr) {
+  Json attr_json;
+  attr_json[NAME] = op_attr_name;
+  attr_json[ATTR_TYPE] = pir::writeAttr(attr);
+
+  VLOG(6) << "Finish write Attribute. ";
+  return attr_json;
+}
+
+Json ProgramWriter::WriteType(const pir::Type& type) {
+  VLOG(6) << "Finish write Type. ";
+  return pir::writeType(type);
+}
+} // namespace pir
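Note: one detail worth calling out in the writer above is the value-id scheme. Op results get positive ids from value_id_, block arguments get negative ids from blockarg_id_, and id 0 is reserved for null values (WriteValue and WriteOpOperand emit 0, and ProgramReader::RecoverProgram seeds id_value_map[0] with a default pir::Value). A compact restatement, with the starting counters assumed to be 1 and -1 since their initializers are not part of this diff:

    #include <cstdint>

    struct ValueIdScheme {
      int64_t value_id_ = 1;      // op results: 1, 2, 3, ...
      int64_t blockarg_id_ = -1;  // block arguments: -1, -2, -3, ...
      // 0 is reserved for "no value"; both reader and writer treat it specially.
      int64_t NextResultId() { return value_id_++; }
      int64_t NextBlockArgId() { return blockarg_id_--; }
    };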
diff --git a/paddle/fluid/pir/serialize_deserialize/src/save_load_parameters.cc b/paddle/fluid/pir/serialize_deserialize/src/save_load_parameters.cc
new file mode 100644
index 0000000000000..d3c047f78b960
--- /dev/null
+++ b/paddle/fluid/pir/serialize_deserialize/src/save_load_parameters.cc
@@ -0,0 +1,209 @@
+/* Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/pir/serialize_deserialize/include/save_load_parameters.h"
+
+#include <fstream>
+#include <sstream>
+#include <vector>
+
+#include "glog/logging.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/phi/common/port.h"
+#include "paddle/phi/kernels/funcs/data_type_transform.h"
+
+namespace pir {
+
+const phi::DeviceContext* GetDeviceContext(const phi::DenseTensor& x) {
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  const phi::DeviceContext* dev_ctx = nullptr;
+  auto place = x.place();
+  dev_ctx = pool.Get(place);
+  return dev_ctx;
+}
+
+const phi::DenseTensor CastTensorType(const phi::DeviceContext* dev_ctx,
+                                      const phi::DenseTensor& x,
+                                      phi::DataType out_dtype) {
+  auto place = x.place();
+  if (paddle::platform::is_cpu_place(place)) {
+    auto out = phi::funcs::TransDataType(
+        reinterpret_cast<const phi::CPUContext&>(*dev_ctx), x, out_dtype);
+    return out;
+  } else if (paddle::platform::is_gpu_place(place)) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    return phi::funcs::TransDataType(
+        reinterpret_cast<const phi::GPUContext&>(*dev_ctx), x, out_dtype);
+#endif
+  }
+  return x;
+}
+
+void SaveFunction(const phi::DenseTensor& x,
+                  const std::string& name,
+                  const std::string& file_path,
+                  bool overwrite,
+                  bool save_as_fp16) {
+  PADDLE_ENFORCE_EQ(
+      FileExists(file_path) && !overwrite,
+      false,
+      phi::errors::PreconditionNotMet(
+          "%s exists! Cannot save to it when overwrite is set to false.",
+          file_path,
+          overwrite));
+
+  MkDirRecursively(DirName(file_path).c_str());
+  VLOG(6) << "save func save path: " << file_path;
+  std::ofstream fout(file_path, std::ios::binary);
+  PADDLE_ENFORCE_EQ(
+      static_cast<bool>(fout),
+      true,
+      phi::errors::Unavailable("Cannot open %s to save variables.", file_path));
+
+  auto in_dtype = x.dtype();
+  auto out_dtype = save_as_fp16 ? phi::DataType::FLOAT16 : in_dtype;
+
+  const phi::DeviceContext* dev_ctx = GetDeviceContext(x);
+  if (in_dtype != out_dtype) {
+    auto out = CastTensorType(dev_ctx, x, out_dtype);
+    paddle::framework::SerializeToStream(fout, out, *dev_ctx);
+  } else {
+    paddle::framework::SerializeToStream(fout, x, *dev_ctx);
+  }
+  fout.close();
+  VLOG(6) << "save func done ";
+}
+
+void SaveCombineFunction(const std::vector<const phi::DenseTensor*>& x,
+                         const std::vector<std::string>& names,
+                         const std::string& file_path,
+                         bool overwrite,
+                         bool save_as_fp16,
+                         bool save_to_memory) {
+  PADDLE_ENFORCE_EQ(
+      FileExists(file_path) && !overwrite,
+      false,
+      phi::errors::PreconditionNotMet(
+          "%s exists! Cannot save to it when overwrite is set to false.",
+          file_path,
+          overwrite));
+
+  MkDirRecursively(DirName(file_path).c_str());
+  VLOG(6) << "save func save path: " << file_path;
+  std::ostringstream ss;
+  PADDLE_ENFORCE_GT(x.size(),
+                    0UL,
+                    phi::errors::InvalidArgument(
+                        "The number of variables to be saved is %d, expect "
+                        "it to be greater than 0.",
+                        x.size()));
+  const phi::DeviceContext* dev_ctx = GetDeviceContext(*(x[0]));
+  for (size_t i = 0; i < x.size(); i++) {
+    auto& tensor = *(x[i]);
+    PADDLE_ENFORCE_EQ(
+        tensor.IsInitialized(),
+        true,
+        phi::errors::InvalidArgument(
+            "The Tensor with Index (%d) to be saved is not initialized.", i));
+    auto in_dtype = tensor.dtype();
+    auto out_dtype = save_as_fp16 ? phi::DataType::FLOAT16 : in_dtype;
+    if (in_dtype != out_dtype) {
+      auto out = CastTensorType(dev_ctx, tensor, out_dtype);
+      paddle::framework::SerializeToStream(ss, out, *dev_ctx);
+    } else {
+      paddle::framework::SerializeToStream(ss, tensor, *dev_ctx);
+    }
+  }
+  MkDirRecursively(DirName(file_path).c_str());
+  std::ofstream fout(file_path, std::ios::binary);
+  PADDLE_ENFORCE_EQ(
+      static_cast<bool>(fout),
+      true,
+      phi::errors::Unavailable("Cannot open %s to save variables.", file_path));
+  fout << ss.str();
+  fout.close();
+  VLOG(6) << "save combine done ";
+}
+
+void LoadFunction(const std::string& file_path,
+                  int64_t seek,
+                  const std::vector<int64_t>& shape,
+                  bool load_as_fp16,
+                  phi::DenseTensor* out) {
+  std::ifstream fin(file_path, std::ios::binary);
+  PADDLE_ENFORCE_EQ(static_cast<bool>(fin),
+                    true,
+                    phi::errors::Unavailable(
+                        "Load operator fail to open file %s, please check "
+                        "whether the model file is complete or damaged.",
+                        file_path));
+  PADDLE_ENFORCE_NOT_NULL(out,
+                          phi::errors::InvalidArgument(
+                              "The variable to be loaded cannot be found."));
+  const phi::DeviceContext* dev_ctx = GetDeviceContext(*out);
+
+  if (seek != -1) {
+    PADDLE_ENFORCE_GE(seek,
+                      0,
+                      phi::errors::InvalidArgument(
+                          "seek with tensor must be greater than or equal "
+                          "to 0"));
+    paddle::framework::DeserializeFromStream(fin, out, *dev_ctx, seek, shape);
+  } else {
+    paddle::framework::DeserializeFromStream(fin, out);
+  }
+
+  auto in_dtype = out->dtype();
+  auto out_dtype = load_as_fp16 ? phi::DataType::FLOAT16 : in_dtype;
+  if (in_dtype != out_dtype) {
+    auto cast_in = *out;
+    *out = CastTensorType(dev_ctx, cast_in, out_dtype);
+  }
+}
+
+void LoadCombineFunction(const std::string& file_path,
+                         const std::vector<std::string>& names,
+                         std::vector<phi::DenseTensor*>* out,
+                         bool load_as_fp16) {
+  std::ifstream fin(file_path, std::ios::binary);
+  PADDLE_ENFORCE_EQ(static_cast<bool>(fin),
+                    true,
+                    phi::errors::Unavailable(
+                        "Load operator fail to open file %s, please check "
+                        "whether the model file is complete or damaged.",
+                        file_path));
+
+  PADDLE_ENFORCE_GT(out->size(),
+                    0UL,
+                    phi::errors::InvalidArgument(
+                        "The number of variables to be loaded is %d, expect "
+                        "it to be greater than 0.",
+                        out->size()));
+  const phi::DeviceContext* dev_ctx = GetDeviceContext(*(out->at(0)));
+  for (size_t i = 0; i < names.size(); i++) {
+    auto tensor = out->at(i);
+    paddle::framework::DeserializeFromStream(fin, tensor);
+
+    auto in_dtype = tensor->dtype();
+    auto out_dtype = load_as_fp16 ? phi::DataType::FLOAT16 : in_dtype;
+    if (in_dtype != out_dtype) {
+      auto cast_in = *tensor;
+      *tensor = CastTensorType(dev_ctx, cast_in, out_dtype);
+    }
+  }
+  fin.peek();
+  PADDLE_ENFORCE_EQ(
+      fin.eof(),
+      true,
+      phi::errors::Unavailable("Not allowed to load partial data via "
+                               "load_combine_op, please use load_op instead."));
+}
+
+} // namespace pir
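Note: taken together, the four functions above give a file-based parameter round trip. A hypothetical two-tensor example (path and names invented; the out-tensors must be constructed on the target place before loading):

    #include <string>
    #include <vector>

    #include "paddle/fluid/pir/serialize_deserialize/include/save_load_parameters.h"
    #include "paddle/phi/core/dense_tensor.h"

    void ParamsRoundTrip(const phi::DenseTensor& w, const phi::DenseTensor& b,
                         phi::DenseTensor* w_out, phi::DenseTensor* b_out) {
      std::vector<const phi::DenseTensor*> params = {&w, &b};
      std::vector<std::string> names = {"w", "b"};
      pir::SaveCombineFunction(params, names, "/tmp/params.pdiparams",
                               /*overwrite=*/true, /*save_as_fp16=*/false,
                               /*save_to_memory=*/false);

      std::vector<phi::DenseTensor*> loaded = {w_out, b_out};
      pir::LoadCombineFunction("/tmp/params.pdiparams", names, &loaded,
                               /*load_as_fp16=*/false);
    }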
diff --git a/paddle/fluid/pir/transforms/CMakeLists.txt b/paddle/fluid/pir/transforms/CMakeLists.txt
index 627fcb78d8563..3a06aa2da7d77 100644
--- a/paddle/fluid/pir/transforms/CMakeLists.txt
+++ b/paddle/fluid/pir/transforms/CMakeLists.txt
@@ -6,7 +6,7 @@ if(NOT WITH_CINN)
        ${CMAKE_CURRENT_SOURCE_DIR}/sub_graph_detector.cc)
 endif()
 
-if(NOT WITH_MKLDNN)
+if(NOT WITH_ONEDNN)
   file(GLOB_RECURSE onednn_srcs "onednn/*.cc")
   list(REMOVE_ITEM transforms_srcs ${onednn_srcs})
 endif()
@@ -26,7 +26,7 @@ set(transforms_deps
     device_event_base)
 
 if(WITH_CINN)
-  set(transforms_deps ${transforms_deps} cinn_op_dialect cinnapi)
+  set(transforms_deps ${transforms_deps} cinnapi)
 endif()
 
 cc_library(
diff --git a/paddle/fluid/pir/transforms/general/constant_folding_pass.cc b/paddle/fluid/pir/transforms/general/constant_folding_pass.cc
index bf1bc26850c56..e70039be7d375 100644
--- a/paddle/fluid/pir/transforms/general/constant_folding_pass.cc
+++ b/paddle/fluid/pir/transforms/general/constant_folding_pass.cc
@@ -238,7 +238,11 @@ class ConstantFoldingPattern : public pir::RewritePattern {
       const std::vector<std::pair<pir::Operation*, int32_t>>& use_ops) const {
     for (auto [use_op, idx] : use_ops) {
       if (use_op->isa<pir::CombineOp>()) {
-        if (!ReplaceResultByParameterOp(use_op)) return false;
+        if (!ReplaceResultByParameterOp(use_op)) {
+          return false;
+        }
+      } else if (use_op->isa<pir::ShadowOutputOp>()) {
+        return false;
       } else if (use_op->HasInterface<paddle::dialect::OpYamlInfoInterface>()) {
         auto [input_infos, _1, _2, _3, _4] =
             use_op->dyn_cast<paddle::dialect::OpYamlInfoInterface>()
@@ -255,6 +259,9 @@ class ConstantFoldingPattern : public pir::RewritePattern {
   }
 
   bool ReplaceResultByParameterOp(pir::Operation* op) const {
+    if (op->isa<pir::ShadowOutputOp>()) {
+      return false;
+    }
     for (uint32_t i = 0; i < op->num_results(); i++) {
       auto use_ops = pir::GetUseOpsForOutput(op, i);
       if (!CheckUseOps(use_ops)) return false;
diff --git a/paddle/fluid/pir/transforms/general/identity_op_clean_pass.cc b/paddle/fluid/pir/transforms/general/identity_op_clean_pass.cc
index fe2369e71a551..ed7dc13da540c 100644
--- a/paddle/fluid/pir/transforms/general/identity_op_clean_pass.cc
+++ b/paddle/fluid/pir/transforms/general/identity_op_clean_pass.cc
@@ -245,39 +245,6 @@ class ReplaceDropoutWithScalePattern : public paddle::drr::DrrPatternBase {
   }
 };
 
-class RemoveRedundantTransposePattern : public paddle::drr::DrrPatternBase {
- public:
-  std::string name() const override {
-    return "RemoveRedundantTransposePattern";
-  }
-
-  void operator()(paddle::drr::DrrPatternContext *ctx) const override {
-    paddle::drr::SourcePattern pat = ctx->SourcePattern();
-    const auto &transpose1 =
-        pat.Op("pd_op.transpose", {{"perm", pat.Attr("perm_1")}});
-    const auto &transpose2 =
-        pat.Op("pd_op.transpose", {{"perm", pat.Attr("perm_2")}});
-
-    pat.Tensor("ret") = transpose2(transpose1(pat.Tensor("arg_transpose")));
-
-    paddle::drr::ResultPattern res = pat.ResultPattern();
-    const auto &new_perm_attr = res.ComputeAttr(
-        [](const paddle::drr::MatchContext &match_ctx) -> std::vector<int> {
-          const auto &perm1 = match_ctx.Attr<std::vector<int>>("perm_1");
-          const auto &perm2 = match_ctx.Attr<std::vector<int>>("perm_2");
-          std::vector<int> new_perm;
-          for (int v : perm2) {
-            new_perm.emplace_back(perm1[v]);
-          }
-          return new_perm;
-        });
-    const auto &transpose_continuous =
-        res.Op("pd_op.transpose", {{"perm", new_perm_attr}});
-
-    res.Tensor("ret") = transpose_continuous(res.Tensor("arg_transpose"));
-  }
-};
-
 class IdentityOpCleanPass : public pir::PatternRewritePass {
  public:
   IdentityOpCleanPass()
@@ -292,7 +259,6 @@ class IdentityOpCleanPass : public pir::PatternRewritePass {
     ps.Add(paddle::drr::Create(context));
     ps.Add(paddle::drr::Create(context));
     ps.Add(paddle::drr::Create(context));
-    ps.Add(paddle::drr::Create<RemoveRedundantTransposePattern>(context));
     return ps;
   }
 };
diff --git a/paddle/fluid/pir/transforms/general/remove_redundant_transpose_pass.cc b/paddle/fluid/pir/transforms/general/remove_redundant_transpose_pass.cc
new file mode 100644
index 0000000000000..d6a1efb0e16ab
--- /dev/null
+++ b/paddle/fluid/pir/transforms/general/remove_redundant_transpose_pass.cc
@@ -0,0 +1,79 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/pir/transforms/general/remove_redundant_transpose_pass.h"
+
+#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
+#include "paddle/fluid/pir/drr/include/drr_pattern_base.h"
+
+#include "paddle/pir/include/pass/pass.h"
+#include "paddle/pir/include/pass/pass_registry.h"
+
+namespace {
+
+class RemoveRedundantTransposePattern : public paddle::drr::DrrPatternBase {
+ public:
+  std::string name() const override {
+    return "RemoveRedundantTransposePattern";
+  }
+
+  void operator()(paddle::drr::DrrPatternContext *ctx) const override {
+    paddle::drr::SourcePattern pat = ctx->SourcePattern();
+    const auto &transpose1 =
+        pat.Op("pd_op.transpose", {{"perm", pat.Attr("perm_1")}});
+    const auto &transpose2 =
+        pat.Op("pd_op.transpose", {{"perm", pat.Attr("perm_2")}});
+
+    pat.Tensor("ret") = transpose2(transpose1(pat.Tensor("arg_transpose")));
+
+    paddle::drr::ResultPattern res = pat.ResultPattern();
+    const auto &new_perm_attr = res.ComputeAttr(
+        [](const paddle::drr::MatchContext &match_ctx) -> std::vector<int> {
+          const auto &perm1 = match_ctx.Attr<std::vector<int>>("perm_1");
+          const auto &perm2 = match_ctx.Attr<std::vector<int>>("perm_2");
+          std::vector<int> new_perm;
+          for (int v : perm2) {
+            new_perm.emplace_back(perm1[v]);
+          }
+          return new_perm;
+        });
+    const auto &transpose_continuous =
+        res.Op("pd_op.transpose", {{"perm", new_perm_attr}});
+
+    res.Tensor("ret") = transpose_continuous(res.Tensor("arg_transpose"));
+  }
+};
+
+class RemoveRedundantTransposePass : public pir::PatternRewritePass {
+ public:
+  RemoveRedundantTransposePass()
+      : pir::PatternRewritePass("remove_redundant_transpose_pass", 2) {}
+
+  pir::RewritePatternSet InitializePatterns(pir::IrContext *context) override {
+    pir::RewritePatternSet ps(context);
+    ps.Add(paddle::drr::Create<RemoveRedundantTransposePattern>(context));
+    return ps;
+  }
+};
+
+} // namespace
+
+namespace pir {
+
+std::unique_ptr<Pass> CreateRemoveRedundantTransposePass() {
+  return std::make_unique<RemoveRedundantTransposePass>();
+}
+} // namespace pir
+
+REGISTER_IR_PASS(remove_redundant_transpose_pass, RemoveRedundantTransposePass);
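Note: the fused permutation above composes as new_perm[i] = perm1[perm2[i]]. Dimension i of the final output comes from dimension perm2[i] of the first transpose's output, which in turn is dimension perm1[perm2[i]] of the original input. A quick stand-alone check with invented perms:

    #include <cassert>
    #include <vector>

    int main() {
      std::vector<int> perm1 = {0, 2, 1, 3};  // first pd_op.transpose
      std::vector<int> perm2 = {0, 1, 3, 2};  // second pd_op.transpose
      std::vector<int> new_perm;
      for (int v : perm2) new_perm.emplace_back(perm1[v]);
      // A single transpose with {0, 2, 3, 1} reproduces the pair.
      assert((new_perm == std::vector<int>{0, 2, 3, 1}));
      return 0;
    }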
diff --git a/paddle/fluid/pir/transforms/general/remove_redundant_transpose_pass.h b/paddle/fluid/pir/transforms/general/remove_redundant_transpose_pass.h
new file mode 100644
index 0000000000000..338dcd26f6564
--- /dev/null
+++ b/paddle/fluid/pir/transforms/general/remove_redundant_transpose_pass.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include "paddle/pir/include/core/dll_decl.h"
+
+namespace pir {
+
+class Pass;
+
+IR_API std::unique_ptr<Pass> CreateRemoveRedundantTransposePass();
+
+} // namespace pir
diff --git a/paddle/fluid/pir/transforms/general/remove_shadow_feed_pass.cc b/paddle/fluid/pir/transforms/general/remove_shadow_feed_pass.cc
new file mode 100644
index 0000000000000..bcf88170bdd54
--- /dev/null
+++ b/paddle/fluid/pir/transforms/general/remove_shadow_feed_pass.cc
@@ -0,0 +1,199 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/pir/transforms/general/remove_shadow_feed_pass.h"
+
+#include "paddle/common/enforce.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/pir/dialect/kernel/ir/kernel_op.h"
+#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
+#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/pir/include/core/block.h"
+#include "paddle/pir/include/core/builtin_op.h"
+#include "paddle/pir/include/core/ir_context.h"
+#include "paddle/pir/include/pass/pass.h"
+#include "paddle/pir/include/pass/pass_registry.h"
+#include "paddle/pir/include/pattern_rewrite/pattern_match.h"
+
+namespace {
+
+std::unique_ptr<paddle::dialect::OpYamlInfoParser> GetParser(
+    pir::Operation *op) {
+  std::unique_ptr<paddle::dialect::OpYamlInfoParser> op_info_parser(nullptr);
+  std::string op_name = op->dyn_cast<paddle::dialect::PhiKernelOp>().op_name();
+  auto op_info = pir::IrContext::Instance()->GetRegisteredOpInfo(op_name);
+  if (op_info.HasInterface<paddle::dialect::OpYamlInfoInterface>()) {
+    auto impl =
+        op_info.GetInterfaceImpl<paddle::dialect::OpYamlInfoInterface>();
+    auto op_info_tuple = impl->get_op_info_(op_name);
+    op_info_parser = std::make_unique<paddle::dialect::OpYamlInfoParser>(
+        op_info_tuple, paddle::dialect::IsLegacyOp(op_name));
+  }
+  return op_info_parser;
+}
+
+template <typename T>
+phi::Place GetVarPlace(const paddle::framework::Variable *var,
+                       const phi::Place &exe_place) {
+  phi::Place place;
+  auto &tensor = var->Get<T>();
+  if (tensor.initialized()) {
+    place = tensor.place();
+  } else {
+    place = exe_place;
+  }
+  return place;
+}
+
+class RemoveShadowFeedPattern
+    : public pir::OpRewritePattern<paddle::dialect::PhiKernelOp> {
+ public:
+  explicit RemoveShadowFeedPattern(pir::IrContext *context,
+                                   const pir::Block *block,
+                                   const phi::Place &place,
+                                   const paddle::framework::Scope *scope)
+      : pir::OpRewritePattern<paddle::dialect::PhiKernelOp>::OpRewritePattern(
+            context),
+        place_(place),
+        scope_(scope) {
+    for (auto &[name, value] : block->kwargs()) {
+      kwargs_map_[value] = name;
+    }
+  }
+
+  bool IsSamePlaceShadowFeed(paddle::dialect::PhiKernelOp op) const {
+    if (op.op_name() == "pd_op.shadow_feed") {
+      auto in = op.operand_source(0);
+      if (!kwargs_map_.count(in)) {
+        return false;
+      }
+      auto in_name = kwargs_map_.at(in);
+      auto *var = scope_->FindVar(in_name);
+      phi::Place var_place;
+      if (var->IsType<phi::DenseTensor>()) {
+        var_place = GetVarPlace<phi::DenseTensor>(var, place_);
+      } else if (var->IsType<phi::SelectedRows>()) {
+        var_place = GetVarPlace<phi::SelectedRows>(var, place_);
+      } else if (var->IsType<paddle::framework::VariableRefArray>()) {
+        var_place =
+            GetVarPlace<paddle::framework::VariableRefArray>(var, place_);
+      } else {
+        PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+            "RemoveShadowFeedPattern only support output "
+            "variable of type DenseTensor, SelectedRows or VariableRefArray"));
+      }
+      return var_place == place_;
+    }
+    return false;
+  }
+
+  bool IsTensorAttrShadowFeed(paddle::dialect::PhiKernelOp op) const {
+    if (op.op_name() == "pd_op.shadow_feed") {
+      auto in = op.operand_source(0);
+      if (!kwargs_map_.count(in)) {
+        return false;
+      }
+      auto out = op.result(0);
+      if (out.use_count() == 1) {
+        auto use_op = out.first_use().owner();
+        if (!use_op->isa<paddle::dialect::PhiKernelOp>()) {
+          return false;
+        }
+        auto op_info_parser = GetParser(use_op);
+        for (size_t i = 0; i < use_op->num_operands(); ++i) {
+          if (out == use_op->operand_source(i) &&
+              op_info_parser->IsTensorAttribute(i)) {
+            return true;
+          }
+        }
+      }
+    }
+    return false;
+  }
+
+  bool Match(paddle::dialect::PhiKernelOp op) const override {
+    return IsSamePlaceShadowFeed(op) || IsTensorAttrShadowFeed(op);
+  }
+
+  void Rewrite(paddle::dialect::PhiKernelOp op,
+               pir::PatternRewriter &rewriter) const override {  // NOLINT
+    auto in = op.operand_source(0);
+    auto out = op.result(0);
+    in.set_type(out.type());
+    rewriter.ReplaceAllUsesWith(out, in);
+    rewriter.EraseOp(op);
+  }
+
+ private:
+  const phi::Place place_;
+  const paddle::framework::Scope *scope_;
+  std::unordered_map<::pir::Value, std::string> kwargs_map_;
+};
+
+class RemoveShadowFeedPass : public pir::PatternRewritePass {
+ public:
+  RemoveShadowFeedPass()
+      : pir::PatternRewritePass("remove_shadow_feed_pass", 0) {}
+
+  pir::RewritePatternSet InitializePatterns(pir::IrContext *context) override {
+    PADDLE_ENFORCE_EQ(
+        Has("top_block"),
+        true,
+        phi::errors::InvalidArgument(
+            "Pass initialize failed. "
+            "When using RemoveShadowFeedPass, the block attribute is "
+            "required! Use Set method to set the block attribute."));
+    PADDLE_ENFORCE_EQ(
+        Has(pir::Pass::kPlaceAttr),
+        true,
+        phi::errors::InvalidArgument(
+            "Pass initialize failed. "
+            "When using RemoveShadowFeedPass, the place attribute is "
+            "required! Use Set method to set the place attribute."));
+    PADDLE_ENFORCE_EQ(
+        Has(pir::Pass::kParamScopeAttr),
+        true,
+        phi::errors::InvalidArgument(
+            "Pass initialize failed. "
+            "When using RemoveShadowFeedPass, the scope attribute is "
+            "required! Use Set method to set the scope attribute."));
+    auto block = &Get<const pir::Block>("top_block");
+    auto &place = Get<const phi::Place>(pir::Pass::kPlaceAttr);
+    auto scope =
+        &Get<const paddle::framework::Scope>(pir::Pass::kParamScopeAttr);
+    PADDLE_ENFORCE_NOT_NULL(
+        block, phi::errors::InvalidArgument("block can not be nullptr"));
+    PADDLE_ENFORCE_NOT_NULL(
+        scope, phi::errors::InvalidArgument("scope can not be nullptr"));
+
+    pir::RewritePatternSet ps(context);
+    ps.Add<RemoveShadowFeedPattern>(context, block, place, scope);
+    return ps;
+  }
+};
+
+} // namespace
+
+namespace pir {
+
+std::unique_ptr<Pass> CreateRemoveShadowFeedPass() {
+  return std::make_unique<RemoveShadowFeedPass>();
+}
+
+} // namespace pir
+
+// REGISTER_IR_PASS(remove_shadow_feed_pass,
+//                  RemoveShadowFeedPass);
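Note: because the pass pulls its block, place and scope from pass attributes rather than constructor arguments, callers have to set all three before running it. A wiring sketch, assuming the Set/SetNotOwned attribute API that pir::Pass exposes elsewhere in Paddle (treat the exact plumbing as illustrative, not authoritative):

    #include <memory>
    #include <utility>

    #include "paddle/fluid/framework/scope.h"
    #include "paddle/fluid/pir/transforms/general/remove_shadow_feed_pass.h"
    #include "paddle/phi/common/place.h"
    #include "paddle/pir/include/pass/pass_manager.h"

    void AddRemoveShadowFeed(pir::PassManager* pm,
                             const pir::Block* top_block,
                             const phi::Place& place,
                             paddle::framework::Scope* scope) {
      auto pass = pir::CreateRemoveShadowFeedPass();
      // The three attributes checked in InitializePatterns above.
      pass->SetNotOwned("top_block", const_cast<pir::Block*>(top_block));
      pass->Set(pir::Pass::kPlaceAttr, new phi::Place(place));
      pass->SetNotOwned(pir::Pass::kParamScopeAttr, scope);
      pm->AddPass(std::move(pass));
    }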
diff --git a/paddle/fluid/pir/transforms/general/remove_shadow_feed_pass.h b/paddle/fluid/pir/transforms/general/remove_shadow_feed_pass.h
new file mode 100644
index 0000000000000..0096eb452a585
--- /dev/null
+++ b/paddle/fluid/pir/transforms/general/remove_shadow_feed_pass.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include "paddle/pir/include/core/dll_decl.h"
+
+namespace pir {
+
+class Pass;
+
+IR_API std::unique_ptr<Pass> CreateRemoveShadowFeedPass();
+
+} // namespace pir
diff --git a/paddle/fluid/pir/transforms/gpu/add_norm_fuse_pass.cc b/paddle/fluid/pir/transforms/gpu/add_norm_fuse_pass.cc
index fc58eb2db607c..bf0c758ef3530 100644
--- a/paddle/fluid/pir/transforms/gpu/add_norm_fuse_pass.cc
+++ b/paddle/fluid/pir/transforms/gpu/add_norm_fuse_pass.cc
@@ -37,7 +37,7 @@ class RmsNormFusePattern : public paddle::drr::DrrPatternBase {
 
   std::string name() const override { return "RmsNormFusePattern"; }
 
-  uint32_t benefit() const override { return 2; }
+  uint32_t benefit() const override { return 3; }
 
   void operator()(paddle::drr::DrrPatternContext *ctx) const override {
     paddle::drr::SourcePattern pat = ctx->SourcePattern();
@@ -139,7 +139,14 @@ class RmsNormFusePattern : public paddle::drr::DrrPatternBase {
 };
 
 class AddRmsNormFusePattern : public paddle::drr::DrrPatternBase {
+ private:
+  const bool extra_add_;
+
  public:
+  explicit AddRmsNormFusePattern(bool extra_add) : extra_add_(extra_add) {}
+
+  uint32_t benefit() const override { return extra_add_ ? 2 : 1; }
+
   std::string name() const override { return "AddRmsNormFusePattern"; }
 
   void operator()(paddle::drr::DrrPatternContext *ctx) const override {
@@ -157,16 +164,21 @@ class AddRmsNormFusePattern : public paddle::drr::DrrPatternBase {
         });
     pat.Tensor("add_out") = add(pat.Tensor("x"), pat.Tensor("residual"));
     pat_rms_norm({&pat.Tensor("add_out"),
-                  &pat.InputNoneTensor(),
+                  &pat.Tensor("bias"),
                   &pat.InputNoneTensor(),
                   &pat.Tensor("w"),
                   &pat.InputNoneTensor()},
                  {&pat.Tensor("rms_norm_out"),
                   &pat.Tensor("residual_out_0"),
                   &pat.Tensor("inv_var_0")});
-
+    // TODO(bukejiyu): remove the following once DRR supports
+    // matching a placeholder op.
+    if (extra_add_) {
+      const auto &add1 = pat.Op(paddle::dialect::AddOp::name());
+      pat.Tensor("add_out1") =
+          add1(pat.Tensor("add_out"), pat.Tensor("any_tensor"));
+    }
     paddle::drr::ResultPattern res = pat.ResultPattern();
-
     const auto &res_rms_norm =
         res.Op(paddle::dialect::RmsNormOp::name(),
                {
@@ -181,19 +193,25 @@ class AddRmsNormFusePattern : public paddle::drr::DrrPatternBase {
     res_rms_norm(
         {
             &res.Tensor("x"),
-            &res.InputNoneTensor(),
+            &res.Tensor("bias"),
            &res.Tensor("residual"),
             &res.Tensor("w"),
             &res.InputNoneTensor(),
        },
        {&res.Tensor("rms_norm_out"),
-        &res.Tensor("residual_out"),
+        &res.Tensor("add_out"),
         &res.Tensor("inv_var")});
   }
 };
 
 class AddLayerNormFusePattern : public paddle::drr::DrrPatternBase {
+ private:
+  const bool extra_add_;
+
  public:
+  explicit AddLayerNormFusePattern(bool extra_add) : extra_add_(extra_add) {}
+
+  uint32_t benefit() const override { return extra_add_ ? 2 : 1; }
 
   std::string name() const override { return "AddLayerNormFusePattern"; }
 
   void operator()(paddle::drr::DrrPatternContext *ctx) const override {
@@ -204,13 +222,31 @@ class AddLayerNormFusePattern : public paddle::drr::DrrPatternBase {
                    {{"epsilon", pat.Attr("epsilon")},
                     {"begin_norm_axis", pat.Attr("begin_norm_axis")}});
     pat.Tensor("add_out") = add(pat.Tensor("x"), pat.Tensor("residual"));
-    layer_norm(
-        {&pat.Tensor("add_out"), &pat.Tensor("w"), &pat.InputNoneTensor()},
-        {&pat.Tensor("layer_norm_out"),
-         &pat.Tensor("mean_out_0"),
-         &pat.Tensor("variance_out_0")});
+    layer_norm({&pat.Tensor("add_out"), &pat.Tensor("w"), &pat.Tensor("bias")},
+               {&pat.Tensor("layer_norm_out"),
+                &pat.Tensor("mean_out_0"),
+                &pat.Tensor("variance_out_0")});
+    // TODO(bukejiyu): remove the following once DRR supports
+    // matching a placeholder op.
+    if (extra_add_) {
+      const auto &add1 = pat.Op(paddle::dialect::AddOp::name());
+      pat.Tensor("add_out1") =
+          add1(pat.Tensor("add_out"), pat.Tensor("any_tensor"));
+    }
 
     paddle::drr::ResultPattern res = pat.ResultPattern();
+    const auto &cast_op_dtype = res.ComputeAttr(
+        [](const paddle::drr::MatchContext &match_ctx) -> phi::DataType {
+          auto x_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("x"));
+          return paddle::dialect::TransToPhiDataType(x_dtype);
+        });
+    const auto &cast_op_1 =
+        res.Op(paddle::dialect::CastOp::name(), {{"dtype", cast_op_dtype}});
+    res.Tensor("casted_bias") = cast_op_1(res.Tensor("bias"));
+    const auto &cast_op_2 =
+        res.Op(paddle::dialect::CastOp::name(), {{"dtype", cast_op_dtype}});
+    res.Tensor("casted_w") = cast_op_2(res.Tensor("w"));
+
     const auto &fuse_layer_norm =
         res.Op(paddle::dialect::FusedBiasResidualLayernormOp::name(),
                {{"epsilon", pat.Attr("epsilon")},
@@ -224,13 +260,13 @@ class AddLayerNormFusePattern : public paddle::drr::DrrPatternBase {
     fuse_layer_norm(
         {
             &res.Tensor("x"),
-            &res.InputNoneTensor(),
+            &res.Tensor("casted_bias"),
             &res.Tensor("residual"),
-            &res.Tensor("w"),
+            &res.Tensor("casted_w"),
             &res.InputNoneTensor(),
         },
         {&res.Tensor("layer_norm_out"),
-         &res.Tensor("residual_out"),
+         &res.Tensor("add_out"),
          &res.Tensor("mean_out"),
          &res.Tensor("variance_out")});
   }
@@ -248,16 +284,19 @@ class AddNormFusePass : public pir::PatternRewritePass {
     //                   mul --->rms_norm
     // w-----------------------------
     bool is_half_weight = true;
+    bool extra_add = true;
     ps.Add(paddle::drr::Create<RmsNormFusePattern>(context, !is_half_weight));
     ps.Add(paddle::drr::Create<RmsNormFusePattern>(context, is_half_weight));
     // x--------
     //           add-rms_norm ---> rms_norm
     // residual-
-    ps.Add(paddle::drr::Create<AddRmsNormFusePattern>(context));
+    ps.Add(paddle::drr::Create<AddRmsNormFusePattern>(context, !extra_add));
+    ps.Add(paddle::drr::Create<AddRmsNormFusePattern>(context, extra_add));
    // x--------
    //           add-layer_norm ----> fused_bias_residual_layernorm
    // residual-
-    ps.Add(paddle::drr::Create<AddLayerNormFusePattern>(context));
+    ps.Add(paddle::drr::Create<AddLayerNormFusePattern>(context, !extra_add));
+    ps.Add(paddle::drr::Create<AddLayerNormFusePattern>(context, extra_add));
     return ps;
   }
 };
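Note: registering each pattern twice with a benefit tied to the constructor flag is how the pass arbitrates between overlapping matches: the drr rewriter prefers higher-benefit patterns, so the extra_add variant wins whenever the fused add_out has an additional consumer, and the plain variant remains the fallback. The minimal shape of such a pattern, mirroring AddRmsNormFusePattern above (sketch only, pattern body elided):

    class MyAddNormFusePattern : public paddle::drr::DrrPatternBase {
     public:
      explicit MyAddNormFusePattern(bool extra_add) : extra_add_(extra_add) {}
      std::string name() const override { return "MyAddNormFusePattern"; }
      // Larger source pattern -> higher benefit -> tried first.
      uint32_t benefit() const override { return extra_add_ ? 2 : 1; }
      void operator()(paddle::drr::DrrPatternContext* ctx) const override {
        // source/result pattern construction as in the patterns above
      }
     private:
      const bool extra_add_;
    };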
diff --git a/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc b/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc
index dfd2b0ed588e2..09ecf2f170155 100644
--- a/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc
+++ b/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc
@@ -20,8 +20,6 @@
 #include "paddle/fluid/pir/drr/include/drr_pattern_base.h"
 #include "paddle/fluid/pir/utils/general_functions.h"
 
-#include "paddle/pir/include/core/builtin_op.h"
-#include "paddle/pir/include/core/value.h"
 #include "paddle/pir/include/pass/pass.h"
 #include "paddle/pir/include/pass/pass_registry.h"
 
@@ -89,7 +87,7 @@ class Conv2dAddFusePattern : public paddle::drr::DrrPatternBase {
                  &res.Tensor("filter"),
                  &res.Tensor("bias"),
                  &res.InputNoneTensor()},
-                {&res.Tensor("add_out")});
+                {&res.Tensor("add_out"), &res.OutputNoneTensor()});
   }
 };
 
diff --git a/paddle/fluid/pir/transforms/gpu/embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/pir/transforms/gpu/embedding_eltwise_layernorm_fuse_pass.cc
index 58409b2fbcb15..97b560e503265 100644
--- a/paddle/fluid/pir/transforms/gpu/embedding_eltwise_layernorm_fuse_pass.cc
+++ b/paddle/fluid/pir/transforms/gpu/embedding_eltwise_layernorm_fuse_pass.cc
@@ -14,10 +14,12 @@
 
 #include "paddle/fluid/pir/transforms/gpu/embedding_eltwise_layernorm_fuse_pass.h"
 
+#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
 #include "paddle/fluid/pir/drr/include/drr_pattern_base.h"
 #include "paddle/fluid/pir/utils/general_functions.h"
 
-#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
+#include "paddle/phi/common/data_type.h"
+#include "paddle/pir/include/core/builtin_type.h"
 #include "paddle/pir/include/pass/pass.h"
 #include "paddle/pir/include/pass/pass_registry.h"
 
@@ -32,19 +34,12 @@ class Fused2EmbeddingEltwiseLayernormPattern
 
   void operator()(paddle::drr::DrrPatternContext *ctx) const override {
     paddle::drr::SourcePattern pat = ctx->SourcePattern();
-    const auto &embedding_1 = pat.Op(paddle::dialect::EmbeddingOp::name(),
-                                     {{{"padding_idx", pat.Attr("padding_idx")},
-                                       {"sparse", pat.Attr("sparse")}}});
-    const auto &embedding_2 = pat.Op(paddle::dialect::EmbeddingOp::name(),
-                                     {{{"padding_idx", pat.Attr("padding_idx")},
-                                       {"sparse", pat.Attr("sparse")}}});
-
+    const auto &embedding_1 = pat.Op(paddle::dialect::EmbeddingOp::name());
+    const auto &embedding_2 = pat.Op(paddle::dialect::EmbeddingOp::name());
     const auto &add = pat.Op(paddle::dialect::AddOp::name());
-    const auto &layernorm =
-        pat.Op(paddle::dialect::LayerNormOp::name(),
-               {{"epsilon", pat.Attr("epsilon")},
-                {"begin_norm_axis", pat.Attr("begin_norm_axis")}});
+    const auto &layernorm = pat.Op(paddle::dialect::LayerNormOp::name(),
+                                   {{"epsilon", pat.Attr("epsilon")}});
 
     embedding_1({&pat.Tensor("x1"), &pat.Tensor("w1")},
                 {&pat.Tensor("embedding_1_out")});
@@ -57,14 +52,20 @@ class Fused2EmbeddingEltwiseLayernormPattern
                {&pat.Tensor("layernorm_out"),
                 &pat.Tensor("layernorm_mean"),
                 &pat.Tensor("layernorm_variance")});
-    pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) {
+
+    pat.RequireNativeCall([](const paddle::drr::MatchContext &match_ctx) {
+      auto w1_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("w1"));
+      auto w2_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("w2"));
+      if (w1_dtype != w2_dtype || (!w1_dtype.isa<pir::Float16Type>() &&
+                                   !w1_dtype.isa<pir::Float32Type>())) {
+        return false;
+      }
+
       auto x1_shape = pir::GetShapeFromValue(match_ctx.Tensor("x1"));
       auto x2_shape = pir::GetShapeFromValue(match_ctx.Tensor("x2"));
-
       if (x1_shape.size() != x2_shape.size()) {
         return false;
       }
-
       for (size_t i = 0; i < x1_shape.size(); i++) {
         if (x1_shape.at(i) != x2_shape.at(i)) {
           return false;
@@ -76,13 +77,25 @@ class Fused2EmbeddingEltwiseLayernormPattern
 
     paddle::drr::ResultPattern res = pat.ResultPattern();
 
-    auto &combine_op_1 = res.Op(pir::CombineOp::name());
+    const auto &combine_op_1 = res.Op(pir::CombineOp::name());
     combine_op_1({&res.Tensor("x1"), &res.Tensor("x2")},
                  {&res.Tensor("combine1_out")});
-    auto &combine_op_2 = res.Op(pir::CombineOp::name());
+    const auto &combine_op_2 = res.Op(pir::CombineOp::name());
     combine_op_2({&res.Tensor("w1"), &res.Tensor("w2")},
                  {&res.Tensor("combine2_out")});
 
+    const auto &cast_op_dtype = res.ComputeAttr(
+        [](const paddle::drr::MatchContext &match_ctx) -> phi::DataType {
+          auto w1_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("w1"));
+          return paddle::dialect::TransToPhiDataType(w1_dtype);
+        });
+    const auto &cast_op_1 =
+        res.Op(paddle::dialect::CastOp::name(), {{"dtype", cast_op_dtype}});
+    res.Tensor("casted_bias") = cast_op_1(res.Tensor("bias"));
+    const auto &cast_op_2 =
+        res.Op(paddle::dialect::CastOp::name(), {{"dtype", cast_op_dtype}});
+    res.Tensor("casted_scale") = cast_op_2(res.Tensor("scale"));
+
     const auto &fused_embedding_eltwise_layernorm_op =
         res.Op(paddle::dialect::FusedEmbeddingEltwiseLayernormOp::name(),
                {{
@@ -90,8 +103,8 @@ class Fused2EmbeddingEltwiseLayernormPattern
                }});
     fused_embedding_eltwise_layernorm_op({&res.Tensor("combine1_out"),
                                           &res.Tensor("combine2_out"),
-                                          &res.Tensor("bias"),
-                                          &res.Tensor("scale")},
+                                          &res.Tensor("casted_bias"),
+                                          &res.Tensor("casted_scale")},
                                          {&res.Tensor("layernorm_out")});
   }
 };
@@ -105,21 +118,13 @@ class Fused3EmbeddingEltwiseLayernormPattern
 
   void operator()(paddle::drr::DrrPatternContext *ctx) const override {
     paddle::drr::SourcePattern pat = ctx->SourcePattern();
-    const auto &embedding_1 = pat.Op(paddle::dialect::EmbeddingOp::name(),
-                                     {{{"padding_idx", pat.Attr("padding_idx")},
-                                       {"sparse", pat.Attr("sparse")}}});
-    const auto &embedding_2 = pat.Op(paddle::dialect::EmbeddingOp::name(),
-                                     {{{"padding_idx", pat.Attr("padding_idx")},
-                                       {"sparse", pat.Attr("sparse")}}});
-    const auto &embedding_3 = pat.Op(paddle::dialect::EmbeddingOp::name(),
-                                     {{{"padding_idx", pat.Attr("padding_idx")},
-                                       {"sparse", pat.Attr("sparse")}}});
+    const auto &embedding_1 = pat.Op(paddle::dialect::EmbeddingOp::name());
+    const auto &embedding_2 = pat.Op(paddle::dialect::EmbeddingOp::name());
+    const auto &embedding_3 = pat.Op(paddle::dialect::EmbeddingOp::name());
     const auto &add1 = pat.Op(paddle::dialect::AddOp::name());
     const auto &add2 = pat.Op(paddle::dialect::AddOp::name());
-    const auto &layernorm =
-        pat.Op(paddle::dialect::LayerNormOp::name(),
-               {{"epsilon", pat.Attr("epsilon")},
-                {"begin_norm_axis", pat.Attr("begin_norm_axis")}});
+    const auto &layernorm = pat.Op(paddle::dialect::LayerNormOp::name(),
+                                   {{"epsilon", pat.Attr("epsilon")}});
 
     embedding_1({&pat.Tensor("x1"), &pat.Tensor("w1")},
                 {&pat.Tensor("embedding_1_out")});
@@ -136,7 +141,17 @@ class Fused3EmbeddingEltwiseLayernormPattern
                {&pat.Tensor("layernorm_out"),
                 &pat.Tensor("layernorm_mean"),
                 &pat.Tensor("layernorm_variance")});
-    pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) {
+
+    pat.RequireNativeCall([](const paddle::drr::MatchContext &match_ctx) {
+      auto w1_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("w1"));
+      auto w2_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("w2"));
+      auto w3_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("w3"));
+      if (w1_dtype != w2_dtype || w1_dtype != w3_dtype ||
+          (!w1_dtype.isa<pir::Float16Type>() &&
+           !w1_dtype.isa<pir::Float32Type>())) {
+        return false;
+      }
+
       auto x1_shape = pir::GetShapeFromValue(match_ctx.Tensor("x1"));
       auto x2_shape = pir::GetShapeFromValue(match_ctx.Tensor("x2"));
       auto x3_shape = pir::GetShapeFromValue(match_ctx.Tensor("x3"));
@@ -146,7 +161,7 @@ class Fused3EmbeddingEltwiseLayernormPattern
       }
       for (size_t i = 0; i < x1_shape.size(); i++) {
         if (x1_shape.at(i) != x2_shape.at(i) ||
-            x1_shape.at(i) != x2_shape.at(i)) {
+            x1_shape.at(i) != x3_shape.at(i)) {
           return false;
         }
       }
@@ -162,6 +177,18 @@ class Fused3EmbeddingEltwiseLayernormPattern
     combine_op_2({&res.Tensor("w1"), &res.Tensor("w2"), &res.Tensor("w3")},
&res.Tensor("w3")}, {&res.Tensor("combine2_out")}); + const auto &cast_op_dtype = res.ComputeAttr( + [](const paddle::drr::MatchContext &match_ctx) -> phi::DataType { + auto w1_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("w1")); + return paddle::dialect::TransToPhiDataType(w1_dtype); + }); + const auto &cast_op_1 = + res.Op(paddle::dialect::CastOp::name(), {{"dtype", cast_op_dtype}}); + res.Tensor("casted_bias") = cast_op_1(res.Tensor("bias")); + const auto &cast_op_2 = + res.Op(paddle::dialect::CastOp::name(), {{"dtype", cast_op_dtype}}); + res.Tensor("casted_scale") = cast_op_2(res.Tensor("scale")); + const auto &fused_embedding_eltwise_layernorm_op = res.Op(paddle::dialect::FusedEmbeddingEltwiseLayernormOp::name(), {{ @@ -169,8 +196,8 @@ class Fused3EmbeddingEltwiseLayernormPattern }}); fused_embedding_eltwise_layernorm_op({&res.Tensor("combine1_out"), &res.Tensor("combine2_out"), - &res.Tensor("bias"), - &res.Tensor("scale")}, + &res.Tensor("casted_bias"), + &res.Tensor("casted_scale")}, {&res.Tensor("layernorm_out")}); } }; @@ -193,7 +220,6 @@ class EmbeddingEltwiseLayernormFusePass : public pir::PatternRewritePass { } // namespace namespace pir { - std::unique_ptr CreateFusedEmbeddingEltwiseLayerNormPass() { return std::make_unique(); } diff --git a/paddle/fluid/pir/transforms/gpu/fc_elementwise_layernorm_fuse_pass.cc b/paddle/fluid/pir/transforms/gpu/fc_elementwise_layernorm_fuse_pass.cc index d3e4ed862e741..fa0436d3e5f78 100644 --- a/paddle/fluid/pir/transforms/gpu/fc_elementwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/pir/transforms/gpu/fc_elementwise_layernorm_fuse_pass.cc @@ -36,13 +36,13 @@ class FcElementwiseLayerNormFusePattern : public paddle::drr::DrrPatternBase { { {"in_num_col_dims", pat.Attr("in_num_col_dims")}, {"activation_type", pat.Attr("activation_type")}, - {"padding_weights", pat.Attr("padding_weights")}, }); const auto &add = pat.Op(paddle::dialect::AddOp::name()); const auto &layernorm = pat.Op(paddle::dialect::LayerNormOp::name(), {{"epsilon", pat.Attr("epsilon")}, {"begin_norm_axis", pat.Attr("begin_norm_axis")}}); + fc({&pat.Tensor("x"), &pat.Tensor("w"), &pat.Tensor("bias0")}, {&pat.Tensor("fc_out")}); add({&pat.Tensor("fc_out"), &pat.Tensor("y")}, {&pat.Tensor("add_out")}); @@ -51,8 +51,14 @@ class FcElementwiseLayerNormFusePattern : public paddle::drr::DrrPatternBase { {&pat.Tensor("layernorm_out"), &pat.Tensor("layernorm_mean"), &pat.Tensor("layernorm_variance")}); - // Constrains the activation is none + pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { + auto x_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("x")); + if (!x_dtype.isa() && + !x_dtype.isa()) { + return false; + } + int64_t layer_norm_x = 1; auto fc_out_dims = pir::GetShapeFromValue(match_ctx.Tensor("fc_out")); auto w_dims = pir::GetShapeFromValue(match_ctx.Tensor("w")); @@ -68,6 +74,18 @@ class FcElementwiseLayerNormFusePattern : public paddle::drr::DrrPatternBase { }); paddle::drr::ResultPattern res = pat.ResultPattern(); + const auto &cast_op_dtype = res.ComputeAttr( + [](const paddle::drr::MatchContext &match_ctx) -> phi::DataType { + auto x_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("x")); + return paddle::dialect::TransToPhiDataType(x_dtype); + }); + const auto &cast_op_1 = + res.Op(paddle::dialect::CastOp::name(), {{"dtype", cast_op_dtype}}); + res.Tensor("casted_bias1") = cast_op_1(res.Tensor("bias1")); + const auto &cast_op_2 = + res.Op(paddle::dialect::CastOp::name(), {{"dtype", cast_op_dtype}}); + res.Tensor("casted_scale") = 
diff --git a/paddle/fluid/pir/transforms/gpu/fc_elementwise_layernorm_fuse_pass.cc b/paddle/fluid/pir/transforms/gpu/fc_elementwise_layernorm_fuse_pass.cc
index d3e4ed862e741..fa0436d3e5f78 100644
--- a/paddle/fluid/pir/transforms/gpu/fc_elementwise_layernorm_fuse_pass.cc
+++ b/paddle/fluid/pir/transforms/gpu/fc_elementwise_layernorm_fuse_pass.cc
@@ -36,13 +36,13 @@ class FcElementwiseLayerNormFusePattern : public paddle::drr::DrrPatternBase {
                {
                    {"in_num_col_dims", pat.Attr("in_num_col_dims")},
                    {"activation_type", pat.Attr("activation_type")},
-                   {"padding_weights", pat.Attr("padding_weights")},
                });
     const auto &add = pat.Op(paddle::dialect::AddOp::name());
     const auto &layernorm =
         pat.Op(paddle::dialect::LayerNormOp::name(),
                {{"epsilon", pat.Attr("epsilon")},
                 {"begin_norm_axis", pat.Attr("begin_norm_axis")}});
+
     fc({&pat.Tensor("x"), &pat.Tensor("w"), &pat.Tensor("bias0")},
        {&pat.Tensor("fc_out")});
     add({&pat.Tensor("fc_out"), &pat.Tensor("y")}, {&pat.Tensor("add_out")});
@@ -51,8 +51,14 @@ class FcElementwiseLayerNormFusePattern : public paddle::drr::DrrPatternBase {
               {&pat.Tensor("layernorm_out"),
                &pat.Tensor("layernorm_mean"),
                &pat.Tensor("layernorm_variance")});
-    // Constrains the activation is none
+
     pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) {
+      auto x_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("x"));
+      if (!x_dtype.isa<pir::Float16Type>() &&
+          !x_dtype.isa<pir::Float32Type>()) {
+        return false;
+      }
+
       int64_t layer_norm_x = 1;
       auto fc_out_dims = pir::GetShapeFromValue(match_ctx.Tensor("fc_out"));
       auto w_dims = pir::GetShapeFromValue(match_ctx.Tensor("w"));
@@ -68,6 +74,18 @@ class FcElementwiseLayerNormFusePattern : public paddle::drr::DrrPatternBase {
     });
 
     paddle::drr::ResultPattern res = pat.ResultPattern();
+    const auto &cast_op_dtype = res.ComputeAttr(
+        [](const paddle::drr::MatchContext &match_ctx) -> phi::DataType {
+          auto x_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("x"));
+          return paddle::dialect::TransToPhiDataType(x_dtype);
+        });
+    const auto &cast_op_1 =
+        res.Op(paddle::dialect::CastOp::name(), {{"dtype", cast_op_dtype}});
+    res.Tensor("casted_bias1") = cast_op_1(res.Tensor("bias1"));
+    const auto &cast_op_2 =
+        res.Op(paddle::dialect::CastOp::name(), {{"dtype", cast_op_dtype}});
+    res.Tensor("casted_scale") = cast_op_2(res.Tensor("scale"));
+
     const auto &fused_fc_elementwise_op =
         res.Op(paddle::dialect::FusedFcElementwiseLayernormOp::name(),
                {{
@@ -80,8 +98,8 @@ class FcElementwiseLayerNormFusePattern : public paddle::drr::DrrPatternBase {
                 &res.Tensor("w"),
                 &res.Tensor("y"),
                 &res.Tensor("bias0"),
-                &res.Tensor("scale"),
-                &res.Tensor("bias1")},
+                &res.Tensor("casted_scale"),
+                &res.Tensor("casted_bias1")},
                {&res.Tensor("layernorm_out"),
                 &res.Tensor("layernorm_mean"),
                 &res.Tensor("layernorm_variance")});
diff --git a/paddle/fluid/pir/transforms/gpu/fused_flash_attn_pass.cc b/paddle/fluid/pir/transforms/gpu/fused_flash_attn_pass.cc
new file mode 100644
index 0000000000000..440aeee5f3ac5
--- /dev/null
+++ b/paddle/fluid/pir/transforms/gpu/fused_flash_attn_pass.cc
@@ -0,0 +1,489 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/pir/transforms/gpu/fused_flash_attn_pass.h"
+
+#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
+#include "paddle/fluid/pir/drr/include/drr_pattern_base.h"
+#include "paddle/fluid/pir/utils/general_functions.h"
+
+#include "paddle/pir/include/pass/pass.h"
+#include "paddle/pir/include/pass/pass_registry.h"
+
+namespace {
+
+class FlashAttnPatternQscale : public paddle::drr::DrrPatternBase {
+ private:
+  bool softmax_with_cast_;
+
+ public:
+  explicit FlashAttnPatternQscale(bool softmax_with_cast)
+      : softmax_with_cast_(softmax_with_cast) {}
+
+  std::string name() const override { return "FlashAttnPatternQscale"; }
+
+  void operator()(paddle::drr::DrrPatternContext *ctx) const override {
+    paddle::drr::SourcePattern src = ctx->SourcePattern();
+    // check the transpose
+    // q[b, s, head, head_dim] -> transpose -> q[b, head, s, head_dim] -> scale
+    const auto &transpose_q = src.Op("pd_op.transpose");
+    src.Tensor("q_transpose_out") = transpose_q(src.Tensor("q"));
+    // scale before matmul
+    const auto &scale_q = src.Op("pd_op.scale");
+    const auto &full_scale =
+        src.Op("pd_op.full", {{"value", src.Attr("scale_q_value")}});
+    src.Tensor("q_scale_out") =
+        scale_q(src.Tensor("q_transpose_out"), full_scale());
+    // k[b, s, head, head_dim] -> transpose -> k[b, head, s, head_dim]
+    // k[b, head, s, head_dim] -> transpose -> k[b, head, head_dim, s]
+    const auto &transpose_k = src.Op("pd_op.transpose");
+    src.Tensor("k_transpose_out") = transpose_k(src.Tensor("k"));
+    const auto &transpose_k2 = src.Op("pd_op.transpose");
+    src.Tensor("k_transpose2_out") =
+        transpose_k2(src.Tensor("k_transpose_out"));
+    // v[b, s, head, head_dim] -> transpose -> v[b, head, s, head_dim]
+    const auto &transpose_v = src.Op("pd_op.transpose");
+    src.Tensor("v_transpose_out") = transpose_v(src.Tensor("v"));
+    // qk
+    const auto &qk_matmul =
+        src.Op("pd_op.matmul",
+               {{"transpose_x", src.Attr("matmul_qk_transpose_x")},
+                {"transpose_y", src.Attr("matmul_qk_transpose_y")}});
+    src.Tensor("qk_out") =
+        qk_matmul(src.Tensor("q_scale_out"), src.Tensor("k_transpose2_out"));
+
+    // mask
+    const auto &mask_add = src.Op("pd_op.add");
+    src.Tensor("mask_add_out") =
+        mask_add(src.Tensor("qk_out"), src.Tensor("mask"));
+
+    if (softmax_with_cast_) {
+      // cast + softmax + cast
+      const auto &softmax_cast1 = src.Op("pd_op.cast");
+      src.Tensor("softmax_cast1_out") =
+          softmax_cast1(src.Tensor("mask_add_out"));
+      const auto &softmax =
+          src.Op("pd_op.softmax", {{"axis", src.Attr("softmax_axis")}});
+      src.Tensor("softmax_cast2_in") = softmax(src.Tensor("softmax_cast1_out"));
+      const auto &softmax_cast2 = src.Op("pd_op.cast");
+      src.Tensor("softmax_out") = softmax_cast2(src.Tensor("softmax_cast2_in"));
+    } else {
+      // softmax
+      const auto &softmax =
+          src.Op("pd_op.softmax", {{"axis", src.Attr("softmax_axis")}});
+      src.Tensor("softmax_out") = softmax(src.Tensor("mask_add_out"));
+    }
+
+    // o
+    const auto &context_matmul =
+        src.Op("pd_op.matmul",
+               {{"transpose_x", src.Attr("context_matmul_transpose_x")},
+                {"transpose_y", src.Attr("context_matmul_transpose_y")}});
+    src.Tensor("context_matmul_out") = context_matmul(
+        src.Tensor("softmax_out"), src.Tensor("v_transpose_out"));
+    const auto &o_transpose = src.Op("pd_op.transpose");
+    src.Tensor("out") = o_transpose(src.Tensor("context_matmul_out"));
+
+    // Constraints
+    src.RequireNativeCall(
+        [](const paddle::drr::MatchContext &match_ctx) -> bool {
+          auto q_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("q"));
+          if (!q_dtype.isa<pir::Float16Type>() &&
+              !q_dtype.isa<pir::BFloat16Type>()) {
+            return false;
+          }
+          // softmax
+          const auto &softmax_axis = match_ctx.Attr<int>("softmax_axis");
+          if (softmax_axis != -1 && softmax_axis != 3) return false;
+          // matmul transpose
+          bool matmul_qk_transpose_x =
+              match_ctx.Attr<bool>("matmul_qk_transpose_x");
+          bool matmul_qk_transpose_y =
+              match_ctx.Attr<bool>("matmul_qk_transpose_y");
+          if (matmul_qk_transpose_x || matmul_qk_transpose_y) return false;
+
+          bool matmul_o_transpose_x =
+              match_ctx.Attr<bool>("context_matmul_transpose_x");
+          bool matmul_o_transpose_y =
+              match_ctx.Attr<bool>("context_matmul_transpose_y");
+          if (matmul_o_transpose_x || matmul_o_transpose_y) return false;
+          // tensor shape
+          auto q_transpose_out =
+              pir::GetShapeFromValue(match_ctx.Tensor("q_transpose_out"));
+          auto k_transpose_out =
+              pir::GetShapeFromValue(match_ctx.Tensor("k_transpose_out"));
+          auto v_transpose_out =
+              pir::GetShapeFromValue(match_ctx.Tensor("v_transpose_out"));
+          if (q_transpose_out.size() != 4 || k_transpose_out.size() != 4 ||
+              v_transpose_out.size() != 4 ||
+              !(q_transpose_out.at(0) == k_transpose_out.at(0) &&
+                k_transpose_out.at(0) == v_transpose_out.at(0)) ||
+              !(q_transpose_out.at(1) == k_transpose_out.at(1) &&
+                k_transpose_out.at(1) == v_transpose_out.at(1)) ||
+              !(q_transpose_out.at(3) == k_transpose_out.at(3) &&
+                k_transpose_out.at(3) == v_transpose_out.at(3))) {
+            return false;
+          }
+          // mask's shape [bs, 1, seq_len, seq_len]
+          auto mask_add = pir::GetShapeFromValue(match_ctx.Tensor("mask"));
+          if (mask_add.size() != 4 || mask_add.at(1) != 1) {
+            return false;
+          }
+
+          return true;
+        });
+
+    //
+    // Result Pattern.
+    //
+    paddle::drr::ResultPattern res = src.ResultPattern();
+    const auto &flash_attn = res.Op("pd_op.flash_attn",
+                                    {{{"dropout", res.Float32Attr(0.0)},
+                                      {"causal", res.BoolAttr(false)},
+                                      {"return_softmax", res.BoolAttr(false)},
+                                      {"is_test", res.BoolAttr(true)},
+                                      {"rng_name", res.StrAttr("")}}});
+    flash_attn({&res.Tensor("q"),
+                &res.Tensor("k"),
+                &res.Tensor("v"),
+                &res.InputNoneTensor(),
+                &res.Tensor("mask")},
+               {&res.Tensor("out"),
+                &res.Tensor("softmax"),
+                &res.Tensor("softmax_lse"),
+                &res.Tensor("seed_offset")});
+  }
+};
+
+// 1. scale after matmul
+// 2. cast before and after softmax
+class FlashAttnPatternOutscale : public paddle::drr::DrrPatternBase {
+ private:
+  bool softmax_with_cast_;
+
+ public:
+  explicit FlashAttnPatternOutscale(bool softmax_with_cast)
+      : softmax_with_cast_(softmax_with_cast) {}
+
+ public:
+  std::string name() const override { return "FlashAttnPatternOutscale"; }
+
+  void operator()(paddle::drr::DrrPatternContext *ctx) const override {
+    paddle::drr::SourcePattern src = ctx->SourcePattern();
+    // check the transpose,
+    // q[b, s, head, head_dim] -> transpose -> q[b, head, s, head_dim] -> scale
+    const auto &transpose_q = src.Op("pd_op.transpose");
+    src.Tensor("q_transpose_out") = transpose_q(src.Tensor("q"));
+    // k[b, s, head, head_dim] -> transpose -> k[b, head, s, head_dim]
+    // k[b, head, s, head_dim] -> transpose -> k[b, head, head_dim, s]
+    const auto &transpose_k = src.Op("pd_op.transpose");
+    src.Tensor("k_transpose_out") = transpose_k(src.Tensor("k"));
+    const auto &transpose_k2 = src.Op("pd_op.transpose");
+    src.Tensor("k_transpose2_out") =
+        transpose_k2(src.Tensor("k_transpose_out"));
+    // v[b, s, head, head_dim] -> transpose -> v[b, head, s, head_dim]
+    const auto &transpose_v = src.Op("pd_op.transpose");
+    src.Tensor("v_transpose_out") = transpose_v(src.Tensor("v"));
+    // qk
+    const auto &qk_matmul =
+        src.Op("pd_op.matmul",
+               {{"transpose_x", src.Attr("matmul_qk_transpose_x")},
+                {"transpose_y", src.Attr("matmul_qk_transpose_y")}});
+    src.Tensor("qk_out") = qk_matmul(src.Tensor("q_transpose_out"),
+                                     src.Tensor("k_transpose2_out"));
+    const auto &scale_out = src.Op("pd_op.scale");
+    const auto &full_scale =
+        src.Op("pd_op.full", {{"value", src.Attr("scale_out_value")}});
+    src.Tensor("qk_scale_out") = scale_out(src.Tensor("qk_out"), full_scale());
+
+    // mask
+    const auto &mask_add = src.Op("pd_op.add");
+    src.Tensor("mask_add_out") =
+        mask_add(src.Tensor("qk_scale_out"), src.Tensor("mask"));
+
+    if (softmax_with_cast_) {
+      // cast + softmax + cast
+      const auto &softmax_cast1 = src.Op("pd_op.cast");
+      src.Tensor("softmax_cast1_out") =
+          softmax_cast1(src.Tensor("mask_add_out"));
+      const auto &softmax =
+          src.Op("pd_op.softmax", {{"axis", src.Attr("softmax_axis")}});
+      src.Tensor("softmax_cast2_in") = softmax(src.Tensor("softmax_cast1_out"));
+      const auto &softmax_cast2 = src.Op("pd_op.cast");
+      src.Tensor("softmax_out") = softmax_cast2(src.Tensor("softmax_cast2_in"));
+    } else {
+      // softmax
+      const auto &softmax =
+          src.Op("pd_op.softmax", {{"axis", src.Attr("softmax_axis")}});
+      src.Tensor("softmax_out") = softmax(src.Tensor("mask_add_out"));
+    }
+
+    // o
+    const auto &context_matmul =
+        src.Op("pd_op.matmul",
+               {{"transpose_x", src.Attr("context_matmul_transpose_x")},
+                {"transpose_y", src.Attr("context_matmul_transpose_y")}});
+    src.Tensor("context_matmul_out") = context_matmul(
+        src.Tensor("softmax_out"), src.Tensor("v_transpose_out"));
+    const auto &o_transpose = src.Op("pd_op.transpose");
+    src.Tensor("out") = o_transpose(src.Tensor("context_matmul_out"));
+
+    // Constraints
+    src.RequireNativeCall(
+        [](const paddle::drr::MatchContext &match_ctx) -> bool {
+          auto q_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("q"));
+          if (!q_dtype.isa<pir::Float16Type>() &&
+              !q_dtype.isa<pir::BFloat16Type>()) {
+            return false;
+          }
+          // softmax
+          const auto &softmax_axis = match_ctx.Attr<int>("softmax_axis");
+          if (softmax_axis != -1 && softmax_axis != 3) return false;
+          // matmul transpose
+          bool matmul_qk_transpose_x =
+              match_ctx.Attr<bool>("matmul_qk_transpose_x");
+          bool matmul_qk_transpose_y =
+              match_ctx.Attr<bool>("matmul_qk_transpose_y");
match_ctx.Attr("matmul_qk_transpose_y"); + if (matmul_qk_transpose_x || matmul_qk_transpose_y) return false; + + bool matmul_o_transpose_x = + match_ctx.Attr("context_matmul_transpose_x"); + bool matmul_o_transpose_y = + match_ctx.Attr("context_matmul_transpose_y"); + if (matmul_o_transpose_x || matmul_o_transpose_y) return false; + // tensor shape + auto q_transpose_out = + pir::GetShapeFromValue(match_ctx.Tensor("q_transpose_out")); + auto k_transpose_out = + pir::GetShapeFromValue(match_ctx.Tensor("k_transpose_out")); + auto v_transpose_out = + pir::GetShapeFromValue(match_ctx.Tensor("v_transpose_out")); + if (q_transpose_out.size() != 4 || k_transpose_out.size() != 4 || + v_transpose_out.size() != 4 || + !(q_transpose_out.at(0) == k_transpose_out.at(0) && + k_transpose_out.at(0) == v_transpose_out.at(0)) || + !(q_transpose_out.at(1) == k_transpose_out.at(1) && + k_transpose_out.at(1) == v_transpose_out.at(1)) || + !(q_transpose_out.at(3) == k_transpose_out.at(3) && + k_transpose_out.at(3) == v_transpose_out.at(3))) { + return false; + } + // mask's shape [bs, 1, seq_len, seq_len] + auto mask_add = pir::GetShapeFromValue(match_ctx.Tensor("mask")); + if (mask_add.size() != 4 || mask_add.at(1) != 1) { + return false; + } + + return true; + }); + + // + // Result Pattern. + // + paddle::drr::ResultPattern res = src.ResultPattern(); + const auto &flash_attn = res.Op("pd_op.flash_attn", + {{{"dropout", res.Float32Attr(0.0)}, + {"causal", res.BoolAttr(false)}, + {"return_softmax", res.BoolAttr(false)}, + {"is_test", res.BoolAttr(true)}, + {"rng_name", res.StrAttr("")}}}); + flash_attn({&res.Tensor("q"), + &res.Tensor("k"), + &res.Tensor("v"), + &res.InputNoneTensor(), + &res.Tensor("mask")}, + {&res.Tensor("out"), + &res.Tensor("softmax"), + &res.Tensor("softmax_lse"), + &res.Tensor("seed_offset")}); + } +}; + +// slice qkv +class TransposeSliceFlashAttnPattern : public paddle::drr::DrrPatternBase { + public: + std::string name() const override { return "TransposeSliceFlashAttnPattern"; } + + void operator()(paddle::drr::DrrPatternContext *ctx) const override { + paddle::drr::SourcePattern src = ctx->SourcePattern(); + // transpose + const auto &transpose_qkv = + src.Op("pd_op.transpose", {{"perm", src.Attr("perm")}}); + src.Tensor("qkv_transpose") = transpose_qkv(src.Tensor("qkv")); + // slice q -> [b, head, s, head_dim] + const auto &slice_q = + src.Op(paddle::dialect::SliceOp::name(), + {{"axes", src.Attr("axes_q")}, + {"infer_flags", src.Attr("infer_flags_q")}, + {"decrease_axis", src.Attr("decrease_axis_q")}}); + const auto &full_int_array_q1 = src.Op("pd_op.full_int_array"); + const auto &full_int_array_q2 = src.Op("pd_op.full_int_array"); + src.Tensor("q") = slice_q( + src.Tensor("qkv_transpose"), full_int_array_q1(), full_int_array_q2()); + // slice k -> [b, head, s, head_dim] + const auto &slice_k = + src.Op(paddle::dialect::SliceOp::name(), + {{"axes", src.Attr("axes_k")}, + {"infer_flags", src.Attr("infer_flags_k")}, + {"decrease_axis", src.Attr("decrease_axis_k")}}); + const auto &full_int_array_k1 = src.Op("pd_op.full_int_array"); + const auto &full_int_array_k2 = src.Op("pd_op.full_int_array"); + src.Tensor("k") = slice_k( + src.Tensor("qkv_transpose"), full_int_array_k1(), full_int_array_k2()); + // slice v -> [b, head, s, head_dim] + const auto &slice_v = + src.Op(paddle::dialect::SliceOp::name(), + {{"axes", src.Attr("axes_v")}, + {"infer_flags", src.Attr("infer_flags_v")}, + {"decrease_axis", src.Attr("decrease_axis_v")}}); + const auto &full_int_array_v1 = 
src.Op("pd_op.full_int_array"); + const auto &full_int_array_v2 = src.Op("pd_op.full_int_array"); + src.Tensor("v") = slice_v( + src.Tensor("qkv_transpose"), full_int_array_v1(), full_int_array_v2()); + + // k[b, head, s, head_dim] -> transpose -> k[b, head, head_dim, s] + const auto &transpose_k = src.Op("pd_op.transpose"); + src.Tensor("k_transpose_out") = transpose_k(src.Tensor("k")); + // qk + const auto &qk_matmul = + src.Op("pd_op.matmul", + {{"transpose_x", src.Attr("matmul_qk_transpose_x")}, + {"transpose_y", src.Attr("matmul_qk_transpose_y")}}); + src.Tensor("qk_out") = + qk_matmul(src.Tensor("q"), src.Tensor("k_transpose_out")); + // scale + const auto &scale_out = src.Op("pd_op.scale"); + const auto &full_scale = + src.Op("pd_op.full", {{"value", src.Attr("scale_out_value")}}); + src.Tensor("qk_scale_out") = scale_out(src.Tensor("qk_out"), full_scale()); + + // mask + const auto &mask_add = src.Op("pd_op.add"); + src.Tensor("mask_add_out") = + mask_add(src.Tensor("qk_scale_out"), src.Tensor("mask")); + + // softmax + const auto &softmax = + src.Op("pd_op.softmax", {{"axis", src.Attr("softmax_axis")}}); + src.Tensor("softmax_out") = softmax(src.Tensor("mask_add_out")); + // o + const auto &context_matmul = + src.Op("pd_op.matmul", + {{"transpose_x", src.Attr("context_matmul_transpose_x")}, + {"transpose_y", src.Attr("context_matmul_transpose_y")}}); + src.Tensor("context_matmul_out") = + context_matmul(src.Tensor("softmax_out"), src.Tensor("v")); + // [b, head, s, head_dim] -> [b, s, head, head_dim] + const auto &o_transpose = src.Op("pd_op.transpose"); + src.Tensor("out") = o_transpose(src.Tensor("context_matmul_out")); + + // Constraints + src.RequireNativeCall( + [](const paddle::drr::MatchContext &match_ctx) -> bool { + auto q_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("q")); + if (!q_dtype.isa() && + !q_dtype.isa()) { + return false; + } + // softmax + const auto &softmax_axis = match_ctx.Attr("softmax_axis"); + if (softmax_axis != -1 && softmax_axis != 3) return false; + // matmul transpose + bool matmul_qk_transpose_x = + match_ctx.Attr("matmul_qk_transpose_x"); + bool matmul_qk_transpose_y = + match_ctx.Attr("matmul_qk_transpose_y"); + if (matmul_qk_transpose_x || matmul_qk_transpose_y) return false; + + bool matmul_o_transpose_x = + match_ctx.Attr("context_matmul_transpose_x"); + bool matmul_o_transpose_y = + match_ctx.Attr("context_matmul_transpose_y"); + if (matmul_o_transpose_x || matmul_o_transpose_y) return false; + // tensor shape + auto q = pir::GetShapeFromValue(match_ctx.Tensor("q")); + auto k = pir::GetShapeFromValue(match_ctx.Tensor("k")); + auto v = pir::GetShapeFromValue(match_ctx.Tensor("v")); + if (q.size() != 4 || k.size() != 4 || v.size() != 4 || + !(q.at(0) == k.at(0) && k.at(0) == v.at(0)) || + !(q.at(1) == k.at(1) && k.at(1) == v.at(1)) || + !(q.at(3) == k.at(3) && k.at(3) == v.at(3))) { + return false; + } + // mask's shape [bs, 1, seq_len, seq_len] + auto mask_add = pir::GetShapeFromValue(match_ctx.Tensor("mask")); + if (mask_add.size() != 4 || mask_add.at(1) != 1) { + return false; + } + + return true; + }); + + // + // Result Pattern. 
+  //
+  paddle::drr::ResultPattern res = src.ResultPattern();
+  // [b, head, seq_len, head_dim] -> [b, seq_len, head, head_dim]
+  const auto &q_transpose = res.Op(
+      "pd_op.transpose", {{"perm", res.VectorInt32Attr({0, 2, 1, 3})}});
+  res.Tensor("q_transpose") = q_transpose(res.Tensor("q"));
+  const auto &k_transpose = res.Op(
+      "pd_op.transpose", {{"perm", res.VectorInt32Attr({0, 2, 1, 3})}});
+  res.Tensor("k_transpose") = k_transpose(res.Tensor("k"));
+  const auto &v_transpose = res.Op(
+      "pd_op.transpose", {{"perm", res.VectorInt32Attr({0, 2, 1, 3})}});
+  res.Tensor("v_transpose") = v_transpose(res.Tensor("v"));
+
+  const auto &flash_attn = res.Op("pd_op.flash_attn",
+                                  {{{"dropout", res.Float32Attr(0.0)},
+                                    {"causal", res.BoolAttr(false)},
+                                    {"return_softmax", res.BoolAttr(false)},
+                                    {"is_test", res.BoolAttr(true)},
+                                    {"rng_name", res.StrAttr("")}}});
+  flash_attn({&res.Tensor("q_transpose"),
+              &res.Tensor("k_transpose"),
+              &res.Tensor("v_transpose"),
+              &res.InputNoneTensor(),
+              &res.Tensor("mask")},
+             {&res.Tensor("out"),
+              &res.Tensor("softmax"),
+              &res.Tensor("softmax_lse"),
+              &res.Tensor("seed_offset")});
+  }
+};
+
+class FusedFlashAttnPass : public pir::PatternRewritePass {
+ public:
+  FusedFlashAttnPass() : pir::PatternRewritePass("fused_flash_attn_pass", 2) {}
+
+  pir::RewritePatternSet InitializePatterns(pir::IrContext *context) override {
+    pir::RewritePatternSet ps(context);
+    ps.Add(paddle::drr::Create<FlashAttnPatternQscale>(context, true));
+    ps.Add(paddle::drr::Create<FlashAttnPatternQscale>(context, false));
+    ps.Add(paddle::drr::Create<FlashAttnPatternOutscale>(context, true));
+    ps.Add(paddle::drr::Create<FlashAttnPatternOutscale>(context, false));
+    ps.Add(paddle::drr::Create<TransposeSliceFlashAttnPattern>(context));
+    return ps;
+  }
+};
+
+}  // namespace
+
+namespace pir {
+std::unique_ptr<Pass> CreateFusedFlashAttnPass() {
+  return std::make_unique<FusedFlashAttnPass>();
+}
+}  // namespace pir
+
+REGISTER_IR_PASS(fused_flash_attn_pass, FusedFlashAttnPass);
diff --git a/paddle/fluid/pir/transforms/gpu/fused_flash_attn_pass.h b/paddle/fluid/pir/transforms/gpu/fused_flash_attn_pass.h
new file mode 100644
index 0000000000000..14183174760bc
--- /dev/null
+++ b/paddle/fluid/pir/transforms/gpu/fused_flash_attn_pass.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include "paddle/pir/include/core/dll_decl.h"
+
+namespace pir {
+
+class Pass;
+
+IR_API std::unique_ptr<Pass> CreateFusedFlashAttnPass();
+
+}  // namespace pir
diff --git a/paddle/fluid/pir/transforms/gpu/fused_weight_only_linear_pass.cc b/paddle/fluid/pir/transforms/gpu/fused_weight_only_linear_pass.cc
index e9b522ce85189..db41a0d5cb78a 100644
--- a/paddle/fluid/pir/transforms/gpu/fused_weight_only_linear_pass.cc
+++ b/paddle/fluid/pir/transforms/gpu/fused_weight_only_linear_pass.cc
@@ -31,7 +31,7 @@ int getSMVersion() {
     sm_version = paddle::platform::GetGPUComputeCapability(
         paddle::platform::GetCurrentDeviceId());
 #else
-  PADDLE_THROW(paddle::platform::errors::Unavailable(
+  PADDLE_THROW(common::errors::Unavailable(
       "fused_weight_only_linear_pass needs paddle compiled with CUDA."));
 #endif
   return sm_version;
@@ -40,11 +40,15 @@ int getSMVersion() {
 class FusedWeightOnlyLinearWithBiasPattern
     : public paddle::drr::DrrPatternBase {
  private:
-  bool reverse_;
+  bool reverse_add_;
+  std::string algo_;
+  int sm_version_;
 
  public:
-  explicit FusedWeightOnlyLinearWithBiasPattern(bool reverse)
-      : reverse_(reverse) {}
+  FusedWeightOnlyLinearWithBiasPattern(bool reverse_add,
+                                       const std::string &algo,
+                                       int sm_version)
+      : reverse_add_(reverse_add), algo_(algo), sm_version_(sm_version) {}
 
   std::string name() const override {
     return "FusedWeightOnlyLinearWithBiasPattern";
@@ -65,8 +69,8 @@
     const auto &add = src.Op(paddle::dialect::AddOp::name());
 
     src.Tensor("add_out") =
-        reverse_ ? add(src.Tensor("matmul_out"), src.Tensor("bias"))
-                 : add(src.Tensor("bias"), src.Tensor("matmul_out"));
+        reverse_add_ ? add(src.Tensor("matmul_out"), src.Tensor("bias"))
+                     : add(src.Tensor("bias"), src.Tensor("matmul_out"));
 
     //
     // Constraints.
@@ -80,21 +84,21 @@
           bool matmul_trans_y = match_ctx.Attr<bool>("matmul_transpose_y");
           if (matmul_trans_x || matmul_trans_y) return false;
 
+          auto w_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("w"));
+          if (!w_dtype.isa<pir::Float16Type>() &&
+              !w_dtype.isa<pir::BFloat16Type>()) {
+            return false;
+          }
+
           auto w_dims = pir::GetShapeFromValue(match_ctx.Tensor("w"));
           auto x_dims = pir::GetShapeFromValue(match_ctx.Tensor("x"));
           auto bias_dims = pir::GetShapeFromValue(match_ctx.Tensor("bias"));
           if (!(w_dims.size() == 2 && x_dims.size() >= 2 &&
-                bias_dims.size() == x_dims.size())) {
+                bias_dims.size() == 1)) {
             return false;
           }
 
           if (w_dims.at(0) % 64 != 0 || w_dims.at(1) % 16 != 0) return false;
-
-          auto w_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("w"));
-          if (!w_dtype.isa<pir::Float16Type>() &&
-              !w_dtype.isa<pir::BFloat16Type>())
-            return false;
-
           if (x_dims.at(x_dims.size() - 1) != w_dims.at(0)) return false;
 
           return true;
@@ -104,19 +108,49 @@
     //
     paddle::drr::ResultPattern res = src.ResultPattern();
 
-    const auto &weight_quantize =
-        res.Op(paddle::dialect::WeightQuantizeOp::name(),
-               {{"algo", res.StrAttr("weight_only_int8")},
-                {"arch", res.Int32Attr(getSMVersion())},
-                {"group_size", res.Int32Attr(-1)}});
-    weight_quantize({&res.Tensor("w")},
-                    {&res.Tensor("quanted_weight_tensor"),
-                     &res.Tensor("weight_scale_tensor")});
+    if (algo_ == "weight_only_int4") {
+      // TODO(liuyuanle): When the operator weight_quantize supports
+      // weight_only_int4 on gpu version, delete the memory copy.
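+      // The weight lives on the GPU, but weight_quantize currently handles
+      // weight_only_int4 only on the CPU, so the result pattern stages a host
+      // round trip: w --memcpy_d2h--> w_cpu --weight_quantize-->
+      // {quanted_weight_tensor_cpu, weight_scale_tensor_cpu} --memcpy_h2d-->
+      // {quanted_weight_tensor, weight_scale_tensor}.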
+ const auto &memcpy_d2h = + res.Op(paddle::dialect::MemcpyD2hOp::name(), + {{"dst_place_type", res.Int32Attr(0 /*cpu*/)}}); + res.Tensor("w_cpu") = memcpy_d2h(res.Tensor("w")); + const auto &weight_quantize = + res.Op(paddle::dialect::WeightQuantizeOp::name(), + {{"algo", res.StrAttr(algo_)}, + {"arch", res.Int32Attr(sm_version_)}, + {"group_size", res.Int32Attr(-1)}}); + weight_quantize({&res.Tensor("w_cpu")}, + {&res.Tensor("quanted_weight_tensor_cpu"), + &res.Tensor("weight_scale_tensor_cpu")}); + + const auto &memcpy_h2d_1 = + res.Op(paddle::dialect::MemcpyH2dOp::name(), + {{"dst_place_type", res.Int32Attr(1 /*gpu*/)}}); + res.Tensor("quanted_weight_tensor") = + memcpy_h2d_1(res.Tensor("quanted_weight_tensor_cpu")); + const auto &memcpy_h2d_2 = + res.Op(paddle::dialect::MemcpyH2dOp::name(), + {{"dst_place_type", res.Int32Attr(1 /*gpu*/)}}); + res.Tensor("weight_scale_tensor") = + memcpy_h2d_2(res.Tensor("weight_scale_tensor_cpu")); + } else { + const auto &weight_quantize = + res.Op(paddle::dialect::WeightQuantizeOp::name(), + {{"algo", res.StrAttr(algo_)}, + {"arch", res.Int32Attr(sm_version_)}, + {"group_size", res.Int32Attr(-1)}}); + + weight_quantize({&res.Tensor("w")}, + {&res.Tensor("quanted_weight_tensor"), + &res.Tensor("weight_scale_tensor")}); + } const auto &weight_only_linear = res.Op(paddle::dialect::WeightOnlyLinearOp::name(), - {{"weight_dtype", res.StrAttr("int8")}, - {"arch", res.Int32Attr(getSMVersion())}, + {{"weight_dtype", + res.StrAttr(algo_ == "weight_only_int8" ? "int8" : "int4")}, + {"arch", res.Int32Attr(sm_version_)}, {"group_size", res.Int32Attr(-1)}}); weight_only_linear({&res.Tensor("x"), &res.Tensor("quanted_weight_tensor"), @@ -127,6 +161,14 @@ class FusedWeightOnlyLinearWithBiasPattern }; class FusedWeightOnlyLinearNoBiasPattern : public paddle::drr::DrrPatternBase { + private: + std::string algo_; + int sm_version_; + + public: + FusedWeightOnlyLinearNoBiasPattern(const std::string &algo, int sm_version) + : algo_(algo), sm_version_(sm_version) {} + public: std::string name() const override { return "FusedWeightOnlyLinearNoBiasPattern"; @@ -179,19 +221,48 @@ class FusedWeightOnlyLinearNoBiasPattern : public paddle::drr::DrrPatternBase { // paddle::drr::ResultPattern res = src.ResultPattern(); - const auto &weight_quantize = - res.Op(paddle::dialect::WeightQuantizeOp::name(), - {{"algo", res.StrAttr("weight_only_int8")}, - {"arch", res.Int32Attr(getSMVersion())}, - {"group_size", res.Int32Attr(-1)}}); - weight_quantize({&res.Tensor("w")}, - {&res.Tensor("quanted_weight_tensor"), - &res.Tensor("weight_scale_tensor")}); - + if (algo_ == "weight_only_int4") { + // TODO(liuyuanle): When the operator weight_quantize supports + // weight_only_int4 on gpu version, delete the memory copy. 
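+      // Same host-side quantization round trip as in
+      // FusedWeightOnlyLinearWithBiasPattern above.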
+      const auto &memcpy_d2h =
+          res.Op(paddle::dialect::MemcpyD2hOp::name(),
+                 {{"dst_place_type", res.Int32Attr(0 /*cpu*/)}});
+      res.Tensor("w_cpu") = memcpy_d2h(res.Tensor("w"));
+      const auto &weight_quantize =
+          res.Op(paddle::dialect::WeightQuantizeOp::name(),
+                 {{"algo", res.StrAttr(algo_)},
+                  {"arch", res.Int32Attr(sm_version_)},
+                  {"group_size", res.Int32Attr(-1)}});
+      weight_quantize({&res.Tensor("w_cpu")},
+                      {&res.Tensor("quanted_weight_tensor_cpu"),
+                       &res.Tensor("weight_scale_tensor_cpu")});
+
+      const auto &memcpy_h2d_1 =
+          res.Op(paddle::dialect::MemcpyH2dOp::name(),
+                 {{"dst_place_type", res.Int32Attr(1 /*gpu*/)}});
+      res.Tensor("quanted_weight_tensor") =
+          memcpy_h2d_1(res.Tensor("quanted_weight_tensor_cpu"));
+      const auto &memcpy_h2d_2 =
+          res.Op(paddle::dialect::MemcpyH2dOp::name(),
+                 {{"dst_place_type", res.Int32Attr(1 /*gpu*/)}});
+      res.Tensor("weight_scale_tensor") =
+          memcpy_h2d_2(res.Tensor("weight_scale_tensor_cpu"));
+    } else {
+      const auto &weight_quantize =
+          res.Op(paddle::dialect::WeightQuantizeOp::name(),
+                 {{"algo", res.StrAttr(algo_)},
+                  {"arch", res.Int32Attr(sm_version_)},
+                  {"group_size", res.Int32Attr(-1)}});
+
+      weight_quantize({&res.Tensor("w")},
+                      {&res.Tensor("quanted_weight_tensor"),
+                       &res.Tensor("weight_scale_tensor")});
+    }
     const auto &weight_only_linear =
         res.Op(paddle::dialect::WeightOnlyLinearOp::name(),
-               {{"weight_dtype", res.StrAttr("int8")},
-                {"arch", res.Int32Attr(getSMVersion())},
+               {{"weight_dtype",
+                 res.StrAttr(algo_ == "weight_only_int8" ? "int8" : "int4")},
+                {"arch", res.Int32Attr(sm_version_)},
                 {"group_size", res.Int32Attr(-1)}});
     weight_only_linear({&res.Tensor("x"),
                         &res.Tensor("quanted_weight_tensor"),
@@ -204,15 +275,28 @@
 class FusedWeightOnlyLinearPass : public pir::PatternRewritePass {
  public:
   FusedWeightOnlyLinearPass()
-      : pir::PatternRewritePass("fused_weight_only_linear_pass", 4) {}
+      : pir::PatternRewritePass("fused_weight_only_linear_pass", 4),
+        sm_version_(getSMVersion()) {}
 
   pir::RewritePatternSet InitializePatterns(pir::IrContext *context) override {
+    std::string algo = "weight_only_int4";
+    if (Has("weight_only_algo")) {
+      algo = Get<std::string>("weight_only_algo");
+    }
+    PADDLE_ENFORCE_EQ(algo == "weight_only_int8" || algo == "weight_only_int4",
+                      true,
+                      common::errors::InvalidArgument(
+                          "fused_weight_only_linear_pass only support "
+                          "weight_only_int8 or weight_only_int4, but get %s.",
+                          algo));
+
     pir::RewritePatternSet ps(context);
-    ps.Add(paddle::drr::Create<FusedWeightOnlyLinearWithBiasPattern>(context,
-                                                                     true));
-    ps.Add(paddle::drr::Create<FusedWeightOnlyLinearWithBiasPattern>(context,
-                                                                     false));
-    ps.Add(paddle::drr::Create<FusedWeightOnlyLinearNoBiasPattern>(context));
+    ps.Add(paddle::drr::Create<FusedWeightOnlyLinearWithBiasPattern>(
+        context, true, algo, sm_version_));
+    ps.Add(paddle::drr::Create<FusedWeightOnlyLinearWithBiasPattern>(
+        context, false, algo, sm_version_));
+    ps.Add(paddle::drr::Create<FusedWeightOnlyLinearNoBiasPattern>(
+        context, algo, sm_version_));
     return ps;
   }
 
@@ -228,15 +312,15 @@ class FusedWeightOnlyLinearPass : public pir::PatternRewritePass {
   }
 
   bool CanApplyOn(pir::Operation *op) const override {
-    int sm_version = getSMVersion();
-    if (sm_version != 70 && sm_version != 75 && sm_version != 80 &&
-        sm_version != 86) {
+    if (sm_version_ != 70 && sm_version_ != 75 && sm_version_ != 80 &&
+        sm_version_ != 86) {
       return false;
     }
     return op->num_regions() > 0;
   }
 
  private:
+  int sm_version_;
   pir::FrozenRewritePatternSet patterns_;
 };
diff --git a/paddle/fluid/pir/transforms/onednn/conv_activation_onednn_fuse_pass.cc b/paddle/fluid/pir/transforms/onednn/conv_activation_onednn_fuse_pass.cc
new file mode 100644
index
0000000000000..2c715ab9b437c --- /dev/null +++ b/paddle/fluid/pir/transforms/onednn/conv_activation_onednn_fuse_pass.cc @@ -0,0 +1,574 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/pir/transforms/onednn/conv_activation_onednn_fuse_pass.h" + +#include "paddle/fluid/pir/dialect/operator/ir/onednn_op.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/fluid/pir/drr/include/drr_pattern_base.h" +#include "paddle/fluid/pir/utils/general_functions.h" + +#include "paddle/pir/include/pass/pass.h" +#include "paddle/pir/include/pass/pass_registry.h" + +namespace { + +class ConvActivationFusePattern : public paddle::drr::DrrPatternBase { + private: + const size_t activation_count_; + std::string activation_name_; + /* + * fused_level_ = 0 : conv2d + activation + fused_level_ > 0 : conv2d + bias + activation + : conv2d + residual + activation + : conv2d + + bias + residual + activation + */ + const int fused_level_; + + public: + ConvActivationFusePattern(size_t activation_count, + const std::string &activation_name, + int fused_level) + : activation_count_(activation_count), + activation_name_(activation_name), + fused_level_(fused_level) {} + + std::string name() const override { + return "Conv" + std::to_string(fused_level_) + activation_name_ + + "FusePattern"; + } + + uint32_t benefit() const override { return activation_count_; } + + void operator()(paddle::drr::DrrPatternContext *ctx) const override { + paddle::drr::SourcePattern pat = ctx->SourcePattern(); + std::string conv_name = paddle::dialect::Conv2dOp::name(); + if (fused_level_ > 0) { + conv_name = paddle::onednn::dialect::FusedConv2dOp::name(); + } + + const auto &conv = + fused_level_ == 0 + ? pat.Op(conv_name, + {{"strides", pat.Attr("strides")}, + {"paddings", pat.Attr("paddings")}, + {"padding_algorithm", pat.Attr("padding_algorithm")}, + {"dilations", pat.Attr("dilations")}, + {"groups", pat.Attr("groups")}, + {"data_format", pat.Attr("data_format")}}) + : pat.Op(conv_name, + {{ + {"strides", pat.Attr("strides")}, + {"paddings", pat.Attr("paddings")}, + {"padding_algorithm", pat.Attr("padding_algorithm")}, + {"dilations", pat.Attr("dilations")}, + {"groups", pat.Attr("groups")}, + {"data_format", pat.Attr("data_format")}, + {"mkldnn_data_type", pat.Attr("mkldnn_data_type")}, + {"fuse_activation", pat.Attr("fuse_activation")}, + {"fuse_residual_connection", + pat.Attr("fuse_residual_connection")}, + {"force_fp32_output", pat.Attr("force_fp32_output")}, + {"fuse_alpha", pat.Attr("fuse_alpha")}, + {"fuse_beta", pat.Attr("fuse_beta")}, + {"scale_in", pat.Attr("scale_in")}, + {"scale_out", pat.Attr("scale_out")}, + {"scale_in_eltwise", pat.Attr("scale_in_eltwise")}, + {"scale_weights", pat.Attr("scale_weights")}, + }}); + + std::string activation_name_op = "pd_op." 
+ activation_name_;
+    if (activation_name_ == "hard_swish") {
+      // oneDNN uses hard_swish; paddle uses hardswish
+      activation_name_op = "pd_op.hardswish";
+    } else if (activation_name_ == "hard_sigmoid") {
+      activation_name_op = "pd_op.hardsigmoid";
+    }
+
+    std::unordered_map<std::string, paddle::drr::Attribute> act_attrs;
+    if (activation_name_op == paddle::dialect::HardsigmoidOp::name()) {
+      act_attrs.emplace("slope", pat.Attr("slope"));
+      act_attrs.emplace("offset", pat.Attr("offset"));
+    } else if (activation_name_op == paddle::dialect::LeakyReluOp::name()) {
+      act_attrs.emplace("negative_slope", pat.Attr("negative_slope"));
+    } else if (activation_name_op == paddle::dialect::GeluOp::name()) {
+      act_attrs.emplace("approximate", pat.Attr("approximate"));
+    }
+    const auto &activation = pat.Op(activation_name_op, act_attrs);
+
+    if (fused_level_ > 0) {
+      conv({&pat.Tensor("input"),
+            &pat.Tensor("filter"),
+            &pat.Tensor("bias"),
+            &pat.Tensor("residual_param")},
+           {&pat.Tensor("conv2d_out")});
+    } else {
+      conv({&pat.Tensor("input"), &pat.Tensor("filter")},
+           {&pat.Tensor("conv2d_out")});
+    }
+    pat.Tensor("act_out") = activation(pat.Tensor("conv2d_out"));
+
+    if (fused_level_ > 0) {
+      pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) {
+        auto act_type = match_ctx.Attr<std::string>("fuse_activation");
+        if (act_type != "") {
+          return false;
+        }
+        return true;
+      });
+    }
+
+    pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) {
+      if (activation_name_ == "leaky_relu") {
+        float negative_slope = match_ctx.Attr<float>("negative_slope");
+        // leaky relu alpha must be a positive number
+        if (negative_slope <= 0.0) {
+          return false;
+        }
+      }
+      return true;
+    });
+
+    paddle::drr::ResultPattern res = pat.ResultPattern();
+    auto fuse_beta = res.Float32Attr(0.0f);
+    auto fuse_alpha = res.Float32Attr(0.0f);
+    if (activation_name_ == "relu6") {
+      fuse_beta = res.Float32Attr(6.0f);
+    } else if (activation_name_ == "hard_swish") {
+      // hard_swish carries no fuse attrs of its own (the op's implicit values
+      // are float threshold = 6.0f, float scale = 6.0f, float offset = 3.0f);
+      // the previous fuse implementation fixed fuse_alpha = 1.f / 6.f and
+      // fuse_beta = 1.f / 2.f, so the same values are used here.
+      fuse_beta = res.Float32Attr(1.f / 2.f);
+      fuse_alpha = res.Float32Attr(1.f / 6.f);
+    } else if (activation_name_ == "swish") {
+      fuse_alpha = res.Float32Attr(1.0f);
+    } else if (activation_name_ == "leaky_relu") {
+      fuse_alpha = pat.Attr("negative_slope");
+    } else if (activation_name_ == "hard_sigmoid") {
+      fuse_alpha = pat.Attr("slope");
+      fuse_beta = pat.Attr("offset");
+    }
+
+    const auto &fused_conv =
+        fused_level_ == 0
+            ?
res.Op(paddle::onednn::dialect::FusedConv2dOp::name(), + {{ + {"strides", pat.Attr("strides")}, + {"paddings", pat.Attr("paddings")}, + {"padding_algorithm", pat.Attr("padding_algorithm")}, + {"dilations", pat.Attr("dilations")}, + {"groups", pat.Attr("groups")}, + {"data_format", pat.Attr("data_format")}, + {"mkldnn_data_type", res.StrAttr("float32")}, + {"fuse_activation", res.StrAttr(activation_name_)}, + {"fuse_residual_connection", res.BoolAttr(false)}, + {"force_fp32_output", res.BoolAttr(false)}, + {"fuse_alpha", fuse_alpha}, + {"fuse_beta", fuse_beta}, + {"scale_in", res.Float32Attr(1.0f)}, + {"scale_out", res.Float32Attr(1.0f)}, + {"scale_in_eltwise", res.Float32Attr(1.0f)}, + {"scale_weights", res.VectorFloatAttr({1.0f})}, + }}) + : res.Op(paddle::onednn::dialect::FusedConv2dOp::name(), + {{ + {"strides", pat.Attr("strides")}, + {"paddings", pat.Attr("paddings")}, + {"padding_algorithm", pat.Attr("padding_algorithm")}, + {"dilations", pat.Attr("dilations")}, + {"groups", pat.Attr("groups")}, + {"data_format", pat.Attr("data_format")}, + {"mkldnn_data_type", pat.Attr("mkldnn_data_type")}, + {"fuse_activation", res.StrAttr(activation_name_)}, + {"fuse_residual_connection", + pat.Attr("fuse_residual_connection")}, + {"force_fp32_output", pat.Attr("force_fp32_output")}, + {"fuse_alpha", fuse_alpha}, + {"fuse_beta", fuse_beta}, + {"scale_in", pat.Attr("scale_in")}, + {"scale_out", pat.Attr("scale_out")}, + {"scale_in_eltwise", pat.Attr("scale_in_eltwise")}, + {"scale_weights", pat.Attr("scale_weights")}, + }}); + + if (fused_level_ > 0) { + fused_conv({&res.Tensor("input"), + &res.Tensor("filter"), + &res.Tensor("bias"), + &res.Tensor("residual_param")}, + {&res.Tensor("act_out")}); + } else { + fused_conv({&res.Tensor("input"), + &res.Tensor("filter"), + &res.InputNoneTensor(), + &res.InputNoneTensor()}, + {&res.Tensor("act_out")}); + } + } +}; + +class ConvGeluFusePattern : public paddle::drr::DrrPatternBase { + private: + std::string activation_name_; + const int fused_level_; + + public: + ConvGeluFusePattern(const std::string &activation_name, int fused_level) + : activation_name_(activation_name), fused_level_(fused_level) {} + + std::string name() const override { return "ConvGeluFusePattern"; } + + uint32_t benefit() const override { return fused_level_ + 1; } + + void operator()(paddle::drr::DrrPatternContext *ctx) const override { + paddle::drr::SourcePattern pat = ctx->SourcePattern(); + std::string conv_name = paddle::dialect::Conv2dOp::name(); + if (fused_level_ > 0) { + conv_name = paddle::onednn::dialect::FusedConv2dOp::name(); + } + + const auto &conv = + fused_level_ == 0 + ? 
pat.Op(conv_name, + {{"strides", pat.Attr("strides")}, + {"paddings", pat.Attr("paddings")}, + {"padding_algorithm", pat.Attr("padding_algorithm")}, + {"dilations", pat.Attr("dilations")}, + {"groups", pat.Attr("groups")}, + {"data_format", pat.Attr("data_format")}}) + : pat.Op(conv_name, + {{ + {"strides", pat.Attr("strides")}, + {"paddings", pat.Attr("paddings")}, + {"padding_algorithm", pat.Attr("padding_algorithm")}, + {"dilations", pat.Attr("dilations")}, + {"groups", pat.Attr("groups")}, + {"data_format", pat.Attr("data_format")}, + {"mkldnn_data_type", pat.Attr("mkldnn_data_type")}, + {"fuse_activation", pat.Attr("fuse_activation")}, + {"fuse_residual_connection", + pat.Attr("fuse_residual_connection")}, + {"force_fp32_output", pat.Attr("force_fp32_output")}, + {"fuse_alpha", pat.Attr("fuse_alpha")}, + {"fuse_beta", pat.Attr("fuse_beta")}, + {"scale_in", pat.Attr("scale_in")}, + {"scale_out", pat.Attr("scale_out")}, + {"scale_in_eltwise", pat.Attr("scale_in_eltwise")}, + {"scale_weights", pat.Attr("scale_weights")}, + }}); + + const auto &activation = + pat.Op(activation_name_, {{"approximate", pat.Attr("approximate")}}); + if (fused_level_ > 0) { + conv({&pat.Tensor("input"), + &pat.Tensor("filter"), + &pat.Tensor("bias"), + &pat.Tensor("residual_param")}, + {&pat.Tensor("conv2d_out")}); + + } else { + conv({&pat.Tensor("input"), &pat.Tensor("filter")}, + {&pat.Tensor("conv2d_out")}); + } + + pat.Tensor("act_out") = activation(pat.Tensor("conv2d_out")); + + if (fused_level_ > 0) { + pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { + auto act_type = match_ctx.Attr("fuse_activation"); + if (act_type != "") { + return false; + } + return true; + }); + } + + paddle::drr::ResultPattern res = pat.ResultPattern(); + const auto &gelu = res.ComputeAttr( + [](const paddle::drr::MatchContext &match_ctx) -> std::string { + bool approximate = match_ctx.Attr("approximate"); + if (approximate) return "gelu_tanh"; + return "gelu_erf"; + }); + auto fuse_residual = res.BoolAttr(false); + + const auto &fused_conv = + fused_level_ == 0 + ? 
res.Op(paddle::onednn::dialect::FusedConv2dOp::name(), + {{ + {"strides", pat.Attr("strides")}, + {"paddings", pat.Attr("paddings")}, + {"padding_algorithm", pat.Attr("padding_algorithm")}, + {"dilations", pat.Attr("dilations")}, + {"groups", pat.Attr("groups")}, + {"data_format", pat.Attr("data_format")}, + {"mkldnn_data_type", res.StrAttr("float32")}, + {"fuse_activation", gelu}, + {"fuse_residual_connection", res.BoolAttr(false)}, + {"force_fp32_output", res.BoolAttr(false)}, + {"fuse_alpha", res.Float32Attr(0.0f)}, + {"fuse_beta", res.Float32Attr(0.0f)}, + {"scale_in", res.Float32Attr(1.0f)}, + {"scale_out", res.Float32Attr(1.0f)}, + {"scale_in_eltwise", res.Float32Attr(1.0f)}, + {"scale_weights", res.VectorFloatAttr({1.0f})}, + }}) + : res.Op(paddle::onednn::dialect::FusedConv2dOp::name(), + {{ + {"strides", pat.Attr("strides")}, + {"paddings", pat.Attr("paddings")}, + {"padding_algorithm", pat.Attr("padding_algorithm")}, + {"dilations", pat.Attr("dilations")}, + {"groups", pat.Attr("groups")}, + {"data_format", pat.Attr("data_format")}, + {"mkldnn_data_type", pat.Attr("mkldnn_data_type")}, + {"fuse_activation", gelu}, + {"fuse_residual_connection", + pat.Attr("fuse_residual_connection")}, + {"force_fp32_output", pat.Attr("force_fp32_output")}, + {"fuse_alpha", pat.Attr("fuse_alpha")}, + {"fuse_beta", pat.Attr("fuse_beta")}, + {"scale_in", pat.Attr("scale_in")}, + {"scale_out", pat.Attr("scale_out")}, + {"scale_in_eltwise", pat.Attr("scale_in_eltwise")}, + {"scale_weights", pat.Attr("scale_weights")}, + }}); + + if (fused_level_ > 0) { + fused_conv({&res.Tensor("input"), + &res.Tensor("filter"), + &res.Tensor("bias"), + &res.Tensor("residual_param")}, + {&res.Tensor("act_out")}); + } else { + fused_conv({&res.Tensor("input"), + &res.Tensor("filter"), + &res.InputNoneTensor(), + &res.InputNoneTensor()}, + {&res.Tensor("act_out")}); + } + } +}; + +class ConvClipFusePattern : public paddle::drr::DrrPatternBase { + private: + std::string activation_name_; + const int fused_level_; + + public: + ConvClipFusePattern(const std::string &activation_name, int fused_level) + : activation_name_(activation_name), fused_level_(fused_level) {} + + std::string name() const override { return "ConvClipFusePattern"; } + + uint32_t benefit() const override { return fused_level_ + 1; } + + void operator()(paddle::drr::DrrPatternContext *ctx) const override { + paddle::drr::SourcePattern pat = ctx->SourcePattern(); + std::string conv_name = paddle::dialect::Conv2dOp::name(); + if (fused_level_ > 0) { + conv_name = paddle::onednn::dialect::FusedConv2dOp::name(); + } + + const auto &full_1 = pat.Op(paddle::dialect::FullOp::name(), + {{"value", pat.Attr("full_1_value")}}); + const auto &full_2 = pat.Op(paddle::dialect::FullOp::name(), + {{"value", pat.Attr("full_2_value")}}); + pat.Tensor("min") = full_1(); + pat.Tensor("max") = full_2(); + const auto &conv = + fused_level_ == 0 + ? 
pat.Op(conv_name, + {{"strides", pat.Attr("strides")}, + {"paddings", pat.Attr("paddings")}, + {"padding_algorithm", pat.Attr("padding_algorithm")}, + {"dilations", pat.Attr("dilations")}, + {"groups", pat.Attr("groups")}, + {"data_format", pat.Attr("data_format")}}) + : pat.Op(conv_name, + {{ + {"strides", pat.Attr("strides")}, + {"paddings", pat.Attr("paddings")}, + {"padding_algorithm", pat.Attr("padding_algorithm")}, + {"dilations", pat.Attr("dilations")}, + {"groups", pat.Attr("groups")}, + {"data_format", pat.Attr("data_format")}, + {"mkldnn_data_type", pat.Attr("mkldnn_data_type")}, + {"fuse_activation", pat.Attr("fuse_activation")}, + {"fuse_residual_connection", + pat.Attr("fuse_residual_connection")}, + {"force_fp32_output", pat.Attr("force_fp32_output")}, + {"fuse_alpha", pat.Attr("fuse_alpha")}, + {"fuse_beta", pat.Attr("fuse_beta")}, + {"scale_in", pat.Attr("scale_in")}, + {"scale_out", pat.Attr("scale_out")}, + {"scale_in_eltwise", pat.Attr("scale_in_eltwise")}, + {"scale_weights", pat.Attr("scale_weights")}, + }}); + + const auto &activation = pat.Op(activation_name_); + if (fused_level_ > 0) { + conv({&pat.Tensor("input"), + &pat.Tensor("filter"), + &pat.Tensor("bias"), + &pat.Tensor("residual_param")}, + {&pat.Tensor("conv2d_out")}); + + } else { + conv({&pat.Tensor("input"), &pat.Tensor("filter")}, + {&pat.Tensor("conv2d_out")}); + } + pat.Tensor("act_out") = activation( + pat.Tensor("conv2d_out"), pat.Tensor("min"), pat.Tensor("max")); + + if (fused_level_ > 0) { + pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { + auto act_type = match_ctx.Attr("fuse_activation"); + if (act_type != "") { + return false; + } + return true; + }); + } + + paddle::drr::ResultPattern res = pat.ResultPattern(); + + const auto &fused_conv = + fused_level_ == 0 + ? 
res.Op(paddle::onednn::dialect::FusedConv2dOp::name(),
+                     {{
+                         {"strides", pat.Attr("strides")},
+                         {"paddings", pat.Attr("paddings")},
+                         {"padding_algorithm", pat.Attr("padding_algorithm")},
+                         {"dilations", pat.Attr("dilations")},
+                         {"groups", pat.Attr("groups")},
+                         {"data_format", pat.Attr("data_format")},
+                         {"mkldnn_data_type", res.StrAttr("float32")},
+                         {"fuse_activation", res.StrAttr("clip")},
+                         {"fuse_residual_connection", res.BoolAttr(false)},
+                         {"force_fp32_output", res.BoolAttr(false)},
+                         {"fuse_alpha", pat.Attr("full_1_value")},
+                         {"fuse_beta", pat.Attr("full_2_value")},
+                         {"scale_in", res.Float32Attr(1.0f)},
+                         {"scale_out", res.Float32Attr(1.0f)},
+                         {"scale_in_eltwise", res.Float32Attr(1.0f)},
+                         {"scale_weights", res.VectorFloatAttr({1.0f})},
+                     }})
+            : res.Op(paddle::onednn::dialect::FusedConv2dOp::name(),
+                     {{
+                         {"strides", pat.Attr("strides")},
+                         {"paddings", pat.Attr("paddings")},
+                         {"padding_algorithm", pat.Attr("padding_algorithm")},
+                         {"dilations", pat.Attr("dilations")},
+                         {"groups", pat.Attr("groups")},
+                         {"data_format", pat.Attr("data_format")},
+                         {"mkldnn_data_type", pat.Attr("mkldnn_data_type")},
+                         {"fuse_activation", res.StrAttr("clip")},
+                         {"fuse_residual_connection",
+                          pat.Attr("fuse_residual_connection")},
+                         {"force_fp32_output", pat.Attr("force_fp32_output")},
+                         {"fuse_alpha", pat.Attr("full_1_value")},
+                         {"fuse_beta", pat.Attr("full_2_value")},
+                         {"scale_in", pat.Attr("scale_in")},
+                         {"scale_out", pat.Attr("scale_out")},
+                         {"scale_in_eltwise", pat.Attr("scale_in_eltwise")},
+                         {"scale_weights", pat.Attr("scale_weights")},
+                     }});
+
+    if (fused_level_ > 0) {
+      fused_conv({&res.Tensor("input"),
+                  &res.Tensor("filter"),
+                  &res.Tensor("bias"),
+                  &res.Tensor("residual_param")},
+                 {&res.Tensor("act_out")});
+    } else {
+      fused_conv({&res.Tensor("input"),
+                  &res.Tensor("filter"),
+                  &res.InputNoneTensor(),
+                  &res.InputNoneTensor()},
+                 {&res.Tensor("act_out")});
+    }
+  }
+};
+
+class ConvActFusePass : public pir::PatternRewritePass {
+ public:
+  ConvActFusePass()
+      : pir::PatternRewritePass("conv_activation_mkldnn_fuse_pass", 3) {}
+
+  pir::RewritePatternSet InitializePatterns(pir::IrContext *context) override {
+    pir::RewritePatternSet ps(context);
+
+    // These eleven activations need no extra attributes and can share the
+    // same pattern
+    std::vector<std::string> supported_activations_name = {"abs",
+                                                           "sqrt",
+                                                           "mish",
+                                                           "relu",
+                                                           "sigmoid",
+                                                           "tanh",
+                                                           "relu6",
+                                                           "hard_swish",
+                                                           "swish",
+                                                           "leaky_relu",
+                                                           "hard_sigmoid"};
+
+    size_t pattern_num = 1;
+    // conv + activation -> fused_conv2d
+    for (auto activation : supported_activations_name) {
+      ps.Add(paddle::drr::Create<ConvActivationFusePattern>(
+          context, pattern_num, activation, 0));
+      pattern_num++;
+    }
+
+    // conv + bias (or residual, or bias + residual) has already been fused
+    // into fused_conv2d; fused_conv2d + activation -> fused_conv2d
+    for (auto activation : supported_activations_name) {
+      ps.Add(paddle::drr::Create<ConvActivationFusePattern>(
+          context, pattern_num, activation, 1));
+      pattern_num++;
+    }
+
+    ps.Add(paddle::drr::Create<ConvGeluFusePattern>(
+        context, paddle::dialect::GeluOp::name(), 0));
+    ps.Add(paddle::drr::Create<ConvGeluFusePattern>(
+        context, paddle::dialect::GeluOp::name(), 1));
+
+    ps.Add(paddle::drr::Create<ConvClipFusePattern>(
+        context, paddle::dialect::ClipOp::name(), 0));
+    ps.Add(paddle::drr::Create<ConvClipFusePattern>(
+        context, paddle::dialect::ClipOp::name(), 1));
+    return ps;
+  }
+};
+
+}  // namespace
+
+namespace pir {
+
+std::unique_ptr<Pass> CreateConv2dActFusePass() {
+  /**
+   *     conv
+   *      |        ->  fused_conv
+   *  activation
+   *
+   *  fused_conv2d (bias or residual)
+   *      |        ->  fused_conv2d
+   *  activation
+   */
+  return std::make_unique<ConvActFusePass>();
+}
+
+}  // namespace pir
+
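+// Minimal usage sketch (hypothetical driver code; assumes the pir::PassManager
+// API used elsewhere in Paddle, nothing defined in this file):
+//
+//   pir::PassManager pm(pir::IrContext::Instance());
+//   pm.AddPass(pir::CreateConv2dActFusePass());
+//   pm.Run(&program);  // rewrites conv2d + activation into fused_conv2d
+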
+REGISTER_IR_PASS(conv_activation_onednn_fuse_pass, ConvActFusePass);
diff --git a/paddle/fluid/pir/transforms/onednn/conv_activation_onednn_fuse_pass.h b/paddle/fluid/pir/transforms/onednn/conv_activation_onednn_fuse_pass.h
new file mode 100644
index 0000000000000..520449bbd028e
--- /dev/null
+++ b/paddle/fluid/pir/transforms/onednn/conv_activation_onednn_fuse_pass.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include "paddle/pir/include/core/dll_decl.h"
+
+namespace pir {
+
+class Pass;
+
+IR_API std::unique_ptr<Pass> CreateConv2dActFusePass();
+
+}  // namespace pir
diff --git a/paddle/fluid/pir/transforms/onednn/conv_concat_activation_onednn_fuse_pass.cc b/paddle/fluid/pir/transforms/onednn/conv_concat_activation_onednn_fuse_pass.cc
new file mode 100644
index 0000000000000..5f2da932bb2af
--- /dev/null
+++ b/paddle/fluid/pir/transforms/onednn/conv_concat_activation_onednn_fuse_pass.cc
@@ -0,0 +1,1039 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "paddle/fluid/pir/transforms/onednn/conv_concat_activation_onednn_fuse_pass.h" + +#include "paddle/fluid/pir/dialect/operator/ir/onednn_op.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/fluid/pir/drr/include/drr_pattern_base.h" + +#include "paddle/pir/include/pass/pass.h" +#include "paddle/pir/include/pass/pass_registry.h" + +namespace { + +class NConvConcatActivationFusePattern : public paddle::drr::DrrPatternBase { + private: + const size_t concat_count_; + std::string activation_name_; + /* + * fused_level_ = 0 : conv2d + activation + fused_level_ = 1 : conv2d + bias + activation + conv2d + residual + activation + conv2d + bias + residual + activation + */ + const int fused_level_; + const int benefit_; + + public: + NConvConcatActivationFusePattern(size_t concat_count, + const std::string &activation_name, + int fused_level, + int benefit) + : concat_count_(concat_count), + activation_name_(activation_name), + fused_level_(fused_level), + benefit_(benefit) {} + + std::string name() const override { + return "Conv" + std::to_string(concat_count_) + "Concat" + "Level" + + std::to_string(fused_level_) + activation_name_ + "Pattern"; + } + + uint32_t benefit() const override { return benefit_; } + + void operator()(paddle::drr::DrrPatternContext *ctx) const override { + paddle::drr::SourcePattern pat = ctx->SourcePattern(); + std::string conv_name = paddle::dialect::Conv2dOp::name(); + if (fused_level_ > 0) { + conv_name = paddle::onednn::dialect::FusedConv2dOp::name(); + } + std::vector combine_in; + for (size_t i = 1; i <= concat_count_; i++) { + const auto &conv = + fused_level_ == 0 + ? pat.Op( + conv_name, + {{"strides", pat.Attr("strides" + std::to_string(i))}, + {"paddings", pat.Attr("paddings" + std::to_string(i))}, + {"padding_algorithm", + pat.Attr("padding_algorithm" + std::to_string(i))}, + {"dilations", pat.Attr("dilations" + std::to_string(i))}, + {"groups", pat.Attr("groups" + std::to_string(i))}, + {"data_format", + pat.Attr("data_format" + std::to_string(i))}}) + : pat.Op( + conv_name, + {{ + {"strides", pat.Attr("strides" + std::to_string(i))}, + {"paddings", pat.Attr("paddings" + std::to_string(i))}, + {"padding_algorithm", + pat.Attr("padding_algorithm" + std::to_string(i))}, + {"dilations", + pat.Attr("dilations" + std::to_string(i))}, + {"groups", pat.Attr("groups" + std::to_string(i))}, + {"data_format", + pat.Attr("data_format" + std::to_string(i))}, + {"mkldnn_data_type", + pat.Attr("mkldnn_data_type" + std::to_string(i))}, + {"fuse_activation", + pat.Attr("fuse_activation" + std::to_string(i))}, + {"fuse_residual_connection", + pat.Attr("fuse_residual_connection" + + std::to_string(i))}, + {"force_fp32_output", + pat.Attr("force_fp32_output" + std::to_string(i))}, + {"fuse_alpha", + pat.Attr("fuse_alpha" + std::to_string(i))}, + {"fuse_beta", + pat.Attr("fuse_beta" + std::to_string(i))}, + {"scale_in", pat.Attr("scale_in" + std::to_string(i))}, + {"scale_out", + pat.Attr("scale_out" + std::to_string(i))}, + {"scale_in_eltwise", + pat.Attr("scale_in_eltwise" + std::to_string(i))}, + {"scale_weights", + pat.Attr("scale_weights" + std::to_string(i))}, + }}); + + if (fused_level_ > 0) { + conv({&pat.Tensor("input" + std::to_string(i)), + &pat.Tensor("filter" + std::to_string(i)), + &pat.Tensor("__@bias" + std::to_string(i) + "@__"), + &pat.Tensor("__@residual" + std::to_string(i) + "@__")}, + {&pat.Tensor("conv2d_out_" + std::to_string(i))}); + + } else { + conv({&pat.Tensor("input" + std::to_string(i)), + 
&pat.Tensor("filter" + std::to_string(i))}, + {&pat.Tensor("conv2d_out_" + std::to_string(i))}); + } + + combine_in.push_back(&pat.Tensor("conv2d_out_" + std::to_string(i))); + } + const auto &combine_op = pat.Op(pir::CombineOp::name()); + const auto &full_op = pat.Op(paddle::dialect::FullOp::name(), + {{"shape", pat.Attr("shape")}, + {"value", pat.Attr("value")}, + {"dtype", pat.Attr("dtype")}, + {"place", pat.Attr("place")}}); + + combine_op(combine_in, {&pat.Tensor("combine_out")}); + const auto &concat_op = pat.Op(paddle::dialect::ConcatOp::name()); + concat_op({&pat.Tensor("combine_out"), &full_op()}, + {&pat.Tensor("concat_out")}); + + std::string activation_name_op = "pd_op." + activation_name_; + if (activation_name_ == "hard_swish") { + // oneDNN use hard_swish, paddle use hardswish + activation_name_op = "pd_op.hardswish"; + } + const auto &activation = + activation_name_op != "pd_op.leaky_relu" + ? pat.Op(activation_name_op) + : pat.Op(activation_name_op, + {{"negative_slope", pat.Attr("negative_slope")}}); + pat.Tensor("activation_out") = activation(pat.Tensor("concat_out")); + + if (fused_level_ > 0) { + pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { + auto act_type = match_ctx.Attr("fuse_activation"); + if (act_type != "") { + return false; + } + return true; + }); + } + pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { + if (activation_name_ == "leaky_relu") { + float negative_slope = match_ctx.Attr("negative_slope"); + // leaky relu alpha is a positive number + if (negative_slope <= 0.0) { + return false; + } + } + return true; + }); + + paddle::drr::ResultPattern res = pat.ResultPattern(); + auto fuse_beta = res.Float32Attr(0.0f); + auto fuse_alpha = res.Float32Attr(0.0f); + if (activation_name_ == "relu6") { + fuse_beta = res.Float32Attr(6.0f); + } else if (activation_name_ == "hard_swish") { + // hard swish have not attr float threshold = 6.0f, float scale = 6.0f, + // float offset = 3.0f attr But in previous implementation hard swish, + // fuse_alpha=1.f / 6.f, fuse_beta=1.f / 2.f, it has fixed + fuse_beta = res.Float32Attr(1.f / 2.f); + fuse_alpha = res.Float32Attr(1.f / 6.f); + } else if (activation_name_ == "swish") { + fuse_alpha = res.Float32Attr(1.0f); + } else if (activation_name_ == "leaky_relu") { + fuse_alpha = pat.Attr("negative_slope"); + } + + std::vector combine_result_in; + // int input_num = 1; + for (size_t i = 1; i <= concat_count_; i++) { + const auto &fused_conv = + fused_level_ == 0 + ? 
res.Op( + paddle::onednn::dialect::FusedConv2dOp::name(), + {{ + {"strides", pat.Attr("strides" + std::to_string(i))}, + {"paddings", pat.Attr("paddings" + std::to_string(i))}, + {"padding_algorithm", + pat.Attr("padding_algorithm" + std::to_string(i))}, + {"dilations", + pat.Attr("dilations" + std::to_string(i))}, + {"groups", pat.Attr("groups" + std::to_string(i))}, + {"data_format", + pat.Attr("data_format" + std::to_string(i))}, + {"mkldnn_data_type", res.StrAttr("float32")}, + {"fuse_activation", res.StrAttr(activation_name_)}, + {"fuse_residual_connection", res.BoolAttr(false)}, + {"force_fp32_output", res.BoolAttr(false)}, + {"fuse_alpha", fuse_alpha}, + {"fuse_beta", fuse_beta}, + {"scale_in", res.Float32Attr(1.0f)}, + {"scale_out", res.Float32Attr(1.0f)}, + {"scale_in_eltwise", res.Float32Attr(1.0f)}, + {"scale_weights", res.VectorFloatAttr({1.0f})}, + }}) + : res.Op( + paddle::onednn::dialect::FusedConv2dOp::name(), + {{ + {"strides", pat.Attr("strides" + std::to_string(i))}, + {"paddings", pat.Attr("paddings" + std::to_string(i))}, + {"padding_algorithm", + pat.Attr("padding_algorithm" + std::to_string(i))}, + {"dilations", + pat.Attr("dilations" + std::to_string(i))}, + {"groups", pat.Attr("groups" + std::to_string(i))}, + {"data_format", + pat.Attr("data_format" + std::to_string(i))}, + {"mkldnn_data_type", + pat.Attr("mkldnn_data_type" + std::to_string(i))}, + {"fuse_activation", res.StrAttr(activation_name_)}, + {"fuse_residual_connection", + pat.Attr("fuse_residual_connection" + + std::to_string(i))}, + {"force_fp32_output", + pat.Attr("force_fp32_output" + std::to_string(i))}, + {"fuse_alpha", fuse_alpha}, + {"fuse_beta", fuse_beta}, + {"scale_in", pat.Attr("scale_in" + std::to_string(i))}, + {"scale_out", + pat.Attr("scale_out" + std::to_string(i))}, + {"scale_in_eltwise", + pat.Attr("scale_in_eltwise" + std::to_string(i))}, + {"scale_weights", + pat.Attr("scale_weights" + std::to_string(i))}, + }}); + + if (fused_level_ > 0) { + fused_conv({&res.Tensor("input" + std::to_string(i)), + &res.Tensor("filter" + std::to_string(i)), + &res.Tensor("__@bias" + std::to_string(i) + "@__"), + &res.Tensor("__@residual" + std::to_string(i) + "@__")}, + {&res.Tensor("act_out_" + std::to_string(i))}); + + } else { + fused_conv({&res.Tensor("input" + std::to_string(i)), + &res.Tensor("filter" + std::to_string(i)), + &res.InputNoneTensor(), + &res.InputNoneTensor()}, + {&res.Tensor("act_out_" + std::to_string(i))}); + } + combine_result_in.push_back(&res.Tensor("act_out_" + std::to_string(i))); + } + + const auto &combine = res.Op(pir::CombineOp::name()); + + combine(combine_result_in, {&res.Tensor("combine_result_out")}); + + // const auto &concat_result_op = + // res.Op(paddle::dialect::ConcatOp::name(),{{"axis", res.Int32Attr(0) }}); + const auto &full_result_op = res.Op(paddle::dialect::FullOp::name(), + {{"shape", pat.Attr("shape")}, + {"value", pat.Attr("value")}, + {"dtype", pat.Attr("dtype")}, + {"place", pat.Attr("place")}}); + + const auto &concat_result_op = res.Op(paddle::dialect::ConcatOp::name()); + concat_result_op({&res.Tensor("combine_result_out"), &full_result_op()}, + {&res.Tensor("activation_out")}); + + // concat_result_op(combine_result_in, {&res.Tensor("concat_out")}); + } +}; + +class NConvConcatHardSigmoidFusePattern : public paddle::drr::DrrPatternBase { + private: + const size_t concat_count_; + std::string activation_name_; + /* + * fused_level_ = 0 : conv2d + activation + fused_level_ = 1 : conv2d + bias + activation + conv2d + residual + activation + conv2d + 
bias + residual + activation + */ + const int fused_level_; + + public: + NConvConcatHardSigmoidFusePattern(size_t concat_count, + const std::string &activation_name, + int fused_level) + : concat_count_(concat_count), + activation_name_(activation_name), + fused_level_(fused_level) {} + + std::string name() const override { + return "Conv" + std::to_string(concat_count_) + "Concat" + "Level" + + std::to_string(fused_level_) + "HardSigmoidPattern"; + } + + uint32_t benefit() const override { return concat_count_; } + + void operator()(paddle::drr::DrrPatternContext *ctx) const override { + paddle::drr::SourcePattern pat = ctx->SourcePattern(); + std::string conv_name = paddle::dialect::Conv2dOp::name(); + if (fused_level_ > 0) { + conv_name = paddle::onednn::dialect::FusedConv2dOp::name(); + } + std::vector combine_in; + for (size_t i = 1; i <= concat_count_; i++) { + const auto &conv = + fused_level_ == 0 + ? pat.Op( + conv_name, + {{"strides", pat.Attr("strides" + std::to_string(i))}, + {"paddings", pat.Attr("paddings" + std::to_string(i))}, + {"padding_algorithm", + pat.Attr("padding_algorithm" + std::to_string(i))}, + {"dilations", pat.Attr("dilations" + std::to_string(i))}, + {"groups", pat.Attr("groups" + std::to_string(i))}, + {"data_format", + pat.Attr("data_format" + std::to_string(i))}}) + : pat.Op( + conv_name, + {{ + {"strides", pat.Attr("strides" + std::to_string(i))}, + {"paddings", pat.Attr("paddings" + std::to_string(i))}, + {"padding_algorithm", + pat.Attr("padding_algorithm" + std::to_string(i))}, + {"dilations", + pat.Attr("dilations" + std::to_string(i))}, + {"groups", pat.Attr("groups" + std::to_string(i))}, + {"data_format", + pat.Attr("data_format" + std::to_string(i))}, + {"mkldnn_data_type", + pat.Attr("mkldnn_data_type" + std::to_string(i))}, + {"fuse_activation", + pat.Attr("fuse_activation" + std::to_string(i))}, + {"fuse_residual_connection", + pat.Attr("fuse_residual_connection" + + std::to_string(i))}, + {"force_fp32_output", + pat.Attr("force_fp32_output" + std::to_string(i))}, + {"fuse_alpha", + pat.Attr("fuse_alpha" + std::to_string(i))}, + {"fuse_beta", + pat.Attr("fuse_beta" + std::to_string(i))}, + {"scale_in", pat.Attr("scale_in" + std::to_string(i))}, + {"scale_out", + pat.Attr("scale_out" + std::to_string(i))}, + {"scale_in_eltwise", + pat.Attr("scale_in_eltwise" + std::to_string(i))}, + {"scale_weights", + pat.Attr("scale_weights" + std::to_string(i))}, + }}); + + if (fused_level_ > 0) { + conv({&pat.Tensor("input" + std::to_string(i)), + &pat.Tensor("filter" + std::to_string(i)), + &pat.Tensor("__@bias" + std::to_string(i) + "@__"), + &pat.Tensor("__@residual" + std::to_string(i) + "@__")}, + {&pat.Tensor("conv2d_out_" + std::to_string(i))}); + + } else { + conv({&pat.Tensor("input" + std::to_string(i)), + &pat.Tensor("filter" + std::to_string(i))}, + {&pat.Tensor("conv2d_out_" + std::to_string(i))}); + } + + combine_in.push_back(&pat.Tensor("conv2d_out_" + std::to_string(i))); + } + const auto &combine_op = pat.Op(pir::CombineOp::name()); + const auto &full_op = pat.Op(paddle::dialect::FullOp::name(), + {{"shape", pat.Attr("shape")}, + {"value", pat.Attr("value")}, + {"dtype", pat.Attr("dtype")}, + {"place", pat.Attr("place")}}); + + combine_op(combine_in, {&pat.Tensor("combine_out")}); + const auto &concat_op = pat.Op(paddle::dialect::ConcatOp::name()); + concat_op({&pat.Tensor("combine_out"), &full_op()}, + {&pat.Tensor("concat_out")}); + + const auto &activation = + pat.Op(activation_name_, + {{"slope", pat.Attr("slope")}, {"offset", 
pat.Attr("offset")}}); + pat.Tensor("activation_out") = activation(pat.Tensor("concat_out")); + + if (fused_level_ > 0) { + pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { + auto act_type = match_ctx.Attr("fuse_activation"); + if (act_type != "") { + return false; + } + return true; + }); + } + paddle::drr::ResultPattern res = pat.ResultPattern(); + + std::vector combine_result_in; + for (size_t i = 1; i <= concat_count_; i++) { + const auto &fused_conv = + fused_level_ == 0 + ? res.Op( + paddle::onednn::dialect::FusedConv2dOp::name(), + {{ + {"strides", pat.Attr("strides" + std::to_string(i))}, + {"paddings", pat.Attr("paddings" + std::to_string(i))}, + {"padding_algorithm", + pat.Attr("padding_algorithm" + std::to_string(i))}, + {"dilations", + pat.Attr("dilations" + std::to_string(i))}, + {"groups", pat.Attr("groups" + std::to_string(i))}, + {"data_format", + pat.Attr("data_format" + std::to_string(i))}, + {"mkldnn_data_type", res.StrAttr("float32")}, + {"fuse_activation", res.StrAttr("hard_sigmoid")}, + {"fuse_residual_connection", res.BoolAttr(false)}, + {"force_fp32_output", res.BoolAttr(false)}, + {"fuse_alpha", pat.Attr("slope")}, + {"fuse_beta", pat.Attr("offset")}, + {"scale_in", res.Float32Attr(1.0f)}, + {"scale_out", res.Float32Attr(1.0f)}, + {"scale_in_eltwise", res.Float32Attr(1.0f)}, + {"scale_weights", res.VectorFloatAttr({1.0f})}, + }}) + : res.Op( + paddle::onednn::dialect::FusedConv2dOp::name(), + {{ + {"strides", pat.Attr("strides" + std::to_string(i))}, + {"paddings", pat.Attr("paddings" + std::to_string(i))}, + {"padding_algorithm", + pat.Attr("padding_algorithm" + std::to_string(i))}, + {"dilations", + pat.Attr("dilations" + std::to_string(i))}, + {"groups", pat.Attr("groups" + std::to_string(i))}, + {"data_format", + pat.Attr("data_format" + std::to_string(i))}, + {"mkldnn_data_type", + pat.Attr("mkldnn_data_type" + std::to_string(i))}, + {"fuse_activation", res.StrAttr("hard_sigmoid")}, + {"fuse_residual_connection", + pat.Attr("fuse_residual_connection" + + std::to_string(i))}, + {"force_fp32_output", + pat.Attr("force_fp32_output" + std::to_string(i))}, + {"fuse_alpha", pat.Attr("slope")}, + {"fuse_beta", pat.Attr("offset")}, + {"scale_in", pat.Attr("scale_in" + std::to_string(i))}, + {"scale_out", + pat.Attr("scale_out" + std::to_string(i))}, + {"scale_in_eltwise", + pat.Attr("scale_in_eltwise" + std::to_string(i))}, + {"scale_weights", + pat.Attr("scale_weights" + std::to_string(i))}, + }}); + + if (fused_level_ > 0) { + fused_conv({&res.Tensor("input" + std::to_string(i)), + &res.Tensor("filter" + std::to_string(i)), + &res.Tensor("__@bias" + std::to_string(i) + "@__"), + &res.Tensor("__@residual" + std::to_string(i) + "@__")}, + {&res.Tensor("act_out_" + std::to_string(i))}); + + } else { + fused_conv({&res.Tensor("input" + std::to_string(i)), + &res.Tensor("filter" + std::to_string(i)), + &res.InputNoneTensor(), + &res.InputNoneTensor()}, + {&res.Tensor("act_out_" + std::to_string(i))}); + } + combine_result_in.push_back(&res.Tensor("act_out_" + std::to_string(i))); + } + + const auto &combine = res.Op(pir::CombineOp::name()); + + combine(combine_result_in, {&res.Tensor("combine_result_out")}); + + // const auto &concat_result_op = + // res.Op(paddle::dialect::ConcatOp::name(),{{"axis", res.Int32Attr(0) }}); + const auto &full_result_op = res.Op(paddle::dialect::FullOp::name(), + {{"shape", pat.Attr("shape")}, + {"value", pat.Attr("value")}, + {"dtype", pat.Attr("dtype")}, + {"place", pat.Attr("place")}}); + + const auto 
&concat_result_op = res.Op(paddle::dialect::ConcatOp::name()); + concat_result_op({&res.Tensor("combine_result_out"), &full_result_op()}, + {&res.Tensor("activation_out")}); + + // concat_result_op(combine_result_in, {&res.Tensor("concat_out")}); + } +}; + +class NConvConcatGeluFusePattern : public paddle::drr::DrrPatternBase { + private: + const size_t concat_count_; + std::string activation_name_; + /* + * fused_level_ = 0 : conv2d + activation + fused_level_ = 1 : conv2d + bias + activation + conv2d + residual + activation + conv2d + bias + residual + activation + */ + const int fused_level_; + + public: + NConvConcatGeluFusePattern(size_t concat_count, + const std::string &activation_name, + int fused_level) + : concat_count_(concat_count), + activation_name_(activation_name), + fused_level_(fused_level) {} + + std::string name() const override { + return "Conv" + std::to_string(concat_count_) + "Concat" + "Level" + + std::to_string(fused_level_) + "GeluPattern"; + } + + uint32_t benefit() const override { return concat_count_; } + + void operator()(paddle::drr::DrrPatternContext *ctx) const override { + paddle::drr::SourcePattern pat = ctx->SourcePattern(); + std::string conv_name = paddle::dialect::Conv2dOp::name(); + if (fused_level_ > 0) { + conv_name = paddle::onednn::dialect::FusedConv2dOp::name(); + } + std::vector combine_in; + for (size_t i = 1; i <= concat_count_; i++) { + const auto &conv = + fused_level_ == 0 + ? pat.Op( + conv_name, + {{"strides", pat.Attr("strides" + std::to_string(i))}, + {"paddings", pat.Attr("paddings" + std::to_string(i))}, + {"padding_algorithm", + pat.Attr("padding_algorithm" + std::to_string(i))}, + {"dilations", pat.Attr("dilations" + std::to_string(i))}, + {"groups", pat.Attr("groups" + std::to_string(i))}, + {"data_format", + pat.Attr("data_format" + std::to_string(i))}}) + : pat.Op( + conv_name, + {{ + {"strides", pat.Attr("strides" + std::to_string(i))}, + {"paddings", pat.Attr("paddings" + std::to_string(i))}, + {"padding_algorithm", + pat.Attr("padding_algorithm" + std::to_string(i))}, + {"dilations", + pat.Attr("dilations" + std::to_string(i))}, + {"groups", pat.Attr("groups" + std::to_string(i))}, + {"data_format", + pat.Attr("data_format" + std::to_string(i))}, + {"mkldnn_data_type", + pat.Attr("mkldnn_data_type" + std::to_string(i))}, + {"fuse_activation", + pat.Attr("fuse_activation" + std::to_string(i))}, + {"fuse_residual_connection", + pat.Attr("fuse_residual_connection" + + std::to_string(i))}, + {"force_fp32_output", + pat.Attr("force_fp32_output" + std::to_string(i))}, + {"fuse_alpha", + pat.Attr("fuse_alpha" + std::to_string(i))}, + {"fuse_beta", + pat.Attr("fuse_beta" + std::to_string(i))}, + {"scale_in", pat.Attr("scale_in" + std::to_string(i))}, + {"scale_out", + pat.Attr("scale_out" + std::to_string(i))}, + {"scale_in_eltwise", + pat.Attr("scale_in_eltwise" + std::to_string(i))}, + {"scale_weights", + pat.Attr("scale_weights" + std::to_string(i))}, + }}); + + if (fused_level_ > 0) { + conv({&pat.Tensor("input" + std::to_string(i)), + &pat.Tensor("filter" + std::to_string(i)), + &pat.Tensor("__@bias" + std::to_string(i) + "@__"), + &pat.Tensor("__@residual" + std::to_string(i) + "@__")}, + {&pat.Tensor("conv2d_out_" + std::to_string(i))}); + + } else { + conv({&pat.Tensor("input" + std::to_string(i)), + &pat.Tensor("filter" + std::to_string(i))}, + {&pat.Tensor("conv2d_out_" + std::to_string(i))}); + } + + combine_in.push_back(&pat.Tensor("conv2d_out_" + std::to_string(i))); + } + const auto &combine_op = 
pat.Op(pir::CombineOp::name()); + const auto &full_op = pat.Op(paddle::dialect::FullOp::name(), + {{"shape", pat.Attr("shape")}, + {"value", pat.Attr("value")}, + {"dtype", pat.Attr("dtype")}, + {"place", pat.Attr("place")}}); + + combine_op(combine_in, {&pat.Tensor("combine_out")}); + const auto &concat_op = pat.Op(paddle::dialect::ConcatOp::name()); + concat_op({&pat.Tensor("combine_out"), &full_op()}, + {&pat.Tensor("concat_out")}); + + const auto &activation = + pat.Op(activation_name_, {{"approximate", pat.Attr("approximate")}}); + pat.Tensor("activation_out") = activation(pat.Tensor("concat_out")); + + if (fused_level_ > 0) { + pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { + auto act_type = match_ctx.Attr("fuse_activation"); + if (act_type != "") { + return false; + } + return true; + }); + } + paddle::drr::ResultPattern res = pat.ResultPattern(); + + std::vector combine_result_in; + const auto &gelu = res.ComputeAttr( + [](const paddle::drr::MatchContext &match_ctx) -> std::string { + bool approximate = match_ctx.Attr("approximate"); + if (approximate) return "gelu_tanh"; + return "gelu_erf"; + }); + + for (size_t i = 1; i <= concat_count_; i++) { + const auto &fused_conv = + fused_level_ == 0 + ? res.Op( + paddle::onednn::dialect::FusedConv2dOp::name(), + {{ + {"strides", pat.Attr("strides" + std::to_string(i))}, + {"paddings", pat.Attr("paddings" + std::to_string(i))}, + {"padding_algorithm", + pat.Attr("padding_algorithm" + std::to_string(i))}, + {"dilations", + pat.Attr("dilations" + std::to_string(i))}, + {"groups", pat.Attr("groups" + std::to_string(i))}, + {"data_format", + pat.Attr("data_format" + std::to_string(i))}, + {"mkldnn_data_type", res.StrAttr("float32")}, + {"fuse_activation", gelu}, + {"fuse_residual_connection", res.BoolAttr(false)}, + {"force_fp32_output", res.BoolAttr(false)}, + {"fuse_alpha", res.Float32Attr(0.0f)}, + {"fuse_beta", res.Float32Attr(0.0f)}, + {"scale_in", res.Float32Attr(1.0f)}, + {"scale_out", res.Float32Attr(1.0f)}, + {"scale_in_eltwise", res.Float32Attr(1.0f)}, + {"scale_weights", res.VectorFloatAttr({1.0f})}, + }}) + : res.Op( + paddle::onednn::dialect::FusedConv2dOp::name(), + {{ + {"strides", pat.Attr("strides" + std::to_string(i))}, + {"paddings", pat.Attr("paddings" + std::to_string(i))}, + {"padding_algorithm", + pat.Attr("padding_algorithm" + std::to_string(i))}, + {"dilations", + pat.Attr("dilations" + std::to_string(i))}, + {"groups", pat.Attr("groups" + std::to_string(i))}, + {"data_format", + pat.Attr("data_format" + std::to_string(i))}, + {"mkldnn_data_type", + pat.Attr("mkldnn_data_type" + std::to_string(i))}, + {"fuse_activation", gelu}, + {"fuse_residual_connection", + pat.Attr("fuse_residual_connection" + + std::to_string(i))}, + {"force_fp32_output", + pat.Attr("force_fp32_output" + std::to_string(i))}, + {"fuse_alpha", res.Float32Attr(0.0f)}, + {"fuse_beta", res.Float32Attr(0.0f)}, + {"scale_in", pat.Attr("scale_in" + std::to_string(i))}, + {"scale_out", + pat.Attr("scale_out" + std::to_string(i))}, + {"scale_in_eltwise", + pat.Attr("scale_in_eltwise" + std::to_string(i))}, + {"scale_weights", + pat.Attr("scale_weights" + std::to_string(i))}, + }}); + + if (fused_level_ > 0) { + fused_conv({&res.Tensor("input" + std::to_string(i)), + &res.Tensor("filter" + std::to_string(i)), + &res.Tensor("__@bias" + std::to_string(i) + "@__"), + &res.Tensor("__@residual" + std::to_string(i) + "@__")}, + {&res.Tensor("act_out_" + std::to_string(i))}); + + } else { + fused_conv({&res.Tensor("input" + 
std::to_string(i)), + &res.Tensor("filter" + std::to_string(i)), + &res.InputNoneTensor(), + &res.InputNoneTensor()}, + {&res.Tensor("act_out_" + std::to_string(i))}); + } + combine_result_in.push_back(&res.Tensor("act_out_" + std::to_string(i))); + } + + const auto &combine = res.Op(pir::CombineOp::name()); + + combine(combine_result_in, {&res.Tensor("combine_result_out")}); + + // const auto &concat_result_op = + // res.Op(paddle::dialect::ConcatOp::name(),{{"axis", res.Int32Attr(0) }}); + const auto &full_result_op = res.Op(paddle::dialect::FullOp::name(), + {{"shape", pat.Attr("shape")}, + {"value", pat.Attr("value")}, + {"dtype", pat.Attr("dtype")}, + {"place", pat.Attr("place")}}); + + const auto &concat_result_op = res.Op(paddle::dialect::ConcatOp::name()); + concat_result_op({&res.Tensor("combine_result_out"), &full_result_op()}, + {&res.Tensor("activation_out")}); + + // concat_result_op(combine_result_in, {&res.Tensor("concat_out")}); + } +}; + +class NConvConcatClipFusePattern : public paddle::drr::DrrPatternBase { + private: + const size_t concat_count_; + std::string activation_name_; + /* + * fused_level_ = 0 : conv2d + activation + fused_level_ = 1 : conv2d + bias + activation + conv2d + residual + activation + conv2d + bias + residual + activation + */ + const int fused_level_; + + public: + NConvConcatClipFusePattern(size_t concat_count, + const std::string &activation_name, + int fused_level) + : concat_count_(concat_count), + activation_name_(activation_name), + fused_level_(fused_level) {} + + std::string name() const override { + return "Conv" + std::to_string(concat_count_) + "Concat" + "Level" + + std::to_string(fused_level_) + "ClipPattern"; + } + + uint32_t benefit() const override { return concat_count_; } + + void operator()(paddle::drr::DrrPatternContext *ctx) const override { + paddle::drr::SourcePattern pat = ctx->SourcePattern(); + std::string conv_name = paddle::dialect::Conv2dOp::name(); + if (fused_level_ > 0) { + conv_name = paddle::onednn::dialect::FusedConv2dOp::name(); + } + + std::vector combine_in; + for (size_t i = 1; i <= concat_count_; i++) { + const auto &conv = + fused_level_ == 0 + ? 
pat.Op( + conv_name, + {{"strides", pat.Attr("strides" + std::to_string(i))}, + {"paddings", pat.Attr("paddings" + std::to_string(i))}, + {"padding_algorithm", + pat.Attr("padding_algorithm" + std::to_string(i))}, + {"dilations", pat.Attr("dilations" + std::to_string(i))}, + {"groups", pat.Attr("groups" + std::to_string(i))}, + {"data_format", + pat.Attr("data_format" + std::to_string(i))}}) + : pat.Op( + conv_name, + {{ + {"strides", pat.Attr("strides" + std::to_string(i))}, + {"paddings", pat.Attr("paddings" + std::to_string(i))}, + {"padding_algorithm", + pat.Attr("padding_algorithm" + std::to_string(i))}, + {"dilations", + pat.Attr("dilations" + std::to_string(i))}, + {"groups", pat.Attr("groups" + std::to_string(i))}, + {"data_format", + pat.Attr("data_format" + std::to_string(i))}, + {"mkldnn_data_type", + pat.Attr("mkldnn_data_type" + std::to_string(i))}, + {"fuse_activation", + pat.Attr("fuse_activation" + std::to_string(i))}, + {"fuse_residual_connection", + pat.Attr("fuse_residual_connection" + + std::to_string(i))}, + {"force_fp32_output", + pat.Attr("force_fp32_output" + std::to_string(i))}, + {"fuse_alpha", + pat.Attr("fuse_alpha" + std::to_string(i))}, + {"fuse_beta", + pat.Attr("fuse_beta" + std::to_string(i))}, + {"scale_in", pat.Attr("scale_in" + std::to_string(i))}, + {"scale_out", + pat.Attr("scale_out" + std::to_string(i))}, + {"scale_in_eltwise", + pat.Attr("scale_in_eltwise" + std::to_string(i))}, + {"scale_weights", + pat.Attr("scale_weights" + std::to_string(i))}, + }}); + + if (fused_level_ > 0) { + conv({&pat.Tensor("input" + std::to_string(i)), + &pat.Tensor("filter" + std::to_string(i)), + &pat.Tensor("__@bias" + std::to_string(i) + "@__"), + &pat.Tensor("__@residual" + std::to_string(i) + "@__")}, + {&pat.Tensor("conv2d_out_" + std::to_string(i))}); + + } else { + conv({&pat.Tensor("input" + std::to_string(i)), + &pat.Tensor("filter" + std::to_string(i))}, + {&pat.Tensor("conv2d_out_" + std::to_string(i))}); + } + + combine_in.push_back(&pat.Tensor("conv2d_out_" + std::to_string(i))); + } + const auto &combine_op = pat.Op(pir::CombineOp::name()); + const auto &full_op = pat.Op(paddle::dialect::FullOp::name(), + {{"shape", pat.Attr("shape")}, + {"value", pat.Attr("value")}, + {"dtype", pat.Attr("dtype")}, + {"place", pat.Attr("place")}}); + + combine_op(combine_in, {&pat.Tensor("combine_out")}); + const auto &concat_op = pat.Op(paddle::dialect::ConcatOp::name()); + concat_op({&pat.Tensor("combine_out"), &full_op()}, + {&pat.Tensor("concat_out")}); + + const auto &full_1 = pat.Op(paddle::dialect::FullOp::name(), + {{"value", pat.Attr("full_1_value")}}); + const auto &full_2 = pat.Op(paddle::dialect::FullOp::name(), + {{"value", pat.Attr("full_2_value")}}); + pat.Tensor("min") = full_1(); + pat.Tensor("max") = full_2(); + + const auto &activation = pat.Op(activation_name_); + + pat.Tensor("activation_out") = activation( + pat.Tensor("concat_out"), pat.Tensor("min"), pat.Tensor("max")); + + if (fused_level_ > 0) { + pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { + auto act_type = match_ctx.Attr("fuse_activation"); + if (act_type != "") { + return false; + } + return true; + }); + } + paddle::drr::ResultPattern res = pat.ResultPattern(); + + std::vector combine_result_in; + for (size_t i = 1; i <= concat_count_; i++) { + const auto &fused_conv = + fused_level_ == 0 + ? 
res.Op( + paddle::onednn::dialect::FusedConv2dOp::name(), + {{ + {"strides", pat.Attr("strides" + std::to_string(i))}, + {"paddings", pat.Attr("paddings" + std::to_string(i))}, + {"padding_algorithm", + pat.Attr("padding_algorithm" + std::to_string(i))}, + {"dilations", + pat.Attr("dilations" + std::to_string(i))}, + {"groups", pat.Attr("groups" + std::to_string(i))}, + {"data_format", + pat.Attr("data_format" + std::to_string(i))}, + {"mkldnn_data_type", res.StrAttr("float32")}, + {"fuse_activation", res.StrAttr("clip")}, + {"fuse_residual_connection", res.BoolAttr(false)}, + {"force_fp32_output", res.BoolAttr(false)}, + {"fuse_alpha", pat.Attr("full_1_value")}, + {"fuse_beta", pat.Attr("full_2_value")}, + {"scale_in", res.Float32Attr(1.0f)}, + {"scale_out", res.Float32Attr(1.0f)}, + {"scale_in_eltwise", res.Float32Attr(1.0f)}, + {"scale_weights", res.VectorFloatAttr({1.0f})}, + }}) + : res.Op( + paddle::onednn::dialect::FusedConv2dOp::name(), + {{ + {"strides", pat.Attr("strides" + std::to_string(i))}, + {"paddings", pat.Attr("paddings" + std::to_string(i))}, + {"padding_algorithm", + pat.Attr("padding_algorithm" + std::to_string(i))}, + {"dilations", + pat.Attr("dilations" + std::to_string(i))}, + {"groups", pat.Attr("groups" + std::to_string(i))}, + {"data_format", + pat.Attr("data_format" + std::to_string(i))}, + {"mkldnn_data_type", + pat.Attr("mkldnn_data_type" + std::to_string(i))}, + {"fuse_activation", res.StrAttr("clip")}, + {"fuse_residual_connection", + pat.Attr("fuse_residual_connection" + + std::to_string(i))}, + {"force_fp32_output", + pat.Attr("force_fp32_output" + std::to_string(i))}, + {"fuse_alpha", pat.Attr("full_1_value")}, + {"fuse_beta", pat.Attr("full_2_value")}, + {"scale_in", pat.Attr("scale_in" + std::to_string(i))}, + {"scale_out", + pat.Attr("scale_out" + std::to_string(i))}, + {"scale_in_eltwise", + pat.Attr("scale_in_eltwise" + std::to_string(i))}, + {"scale_weights", + pat.Attr("scale_weights" + std::to_string(i))}, + }}); + + if (fused_level_ > 0) { + fused_conv({&res.Tensor("input" + std::to_string(i)), + &res.Tensor("filter" + std::to_string(i)), + &res.Tensor("__@bias" + std::to_string(i) + "@__"), + &res.Tensor("__@residual" + std::to_string(i) + "@__")}, + {&res.Tensor("act_out_" + std::to_string(i))}); + + } else { + fused_conv({&res.Tensor("input" + std::to_string(i)), + &res.Tensor("filter" + std::to_string(i)), + &res.InputNoneTensor(), + &res.InputNoneTensor()}, + {&res.Tensor("act_out_" + std::to_string(i))}); + } + combine_result_in.push_back(&res.Tensor("act_out_" + std::to_string(i))); + } + + const auto &combine = res.Op(pir::CombineOp::name()); + + combine(combine_result_in, {&res.Tensor("combine_result_out")}); + + // const auto &concat_result_op = + // res.Op(paddle::dialect::ConcatOp::name(),{{"axis", res.Int32Attr(0) }}); + const auto &full_result_op = res.Op(paddle::dialect::FullOp::name(), + {{"shape", pat.Attr("shape")}, + {"value", pat.Attr("value")}, + {"dtype", pat.Attr("dtype")}, + {"place", pat.Attr("place")}}); + + const auto &concat_result_op = res.Op(paddle::dialect::ConcatOp::name()); + concat_result_op({&res.Tensor("combine_result_out"), &full_result_op()}, + {&res.Tensor("activation_out")}); + + // concat_result_op(combine_result_in, {&res.Tensor("concat_out")}); + } +}; + +class ConvConcatActFusePass : public pir::PatternRewritePass { + public: + ConvConcatActFusePass() + : pir::PatternRewritePass("conv_concat_activation_mkldnn_fuse_pass", 3) {} + + pir::RewritePatternSet InitializePatterns(pir::IrContext *context) 
override {
+    pir::RewritePatternSet ps(context);
+    std::vector<std::string> supported_activations_name = {"abs",
+                                                           "sqrt",
+                                                           "mish",
+                                                           "relu",
+                                                           "sigmoid",
+                                                           "tanh",
+                                                           "relu6",
+                                                           "hard_swish",
+                                                           "swish",
+                                                           "leaky_relu"};
+    int benefit = 1;
+    /**
+     * To keep the number of generated loop patterns manageable (too many of
+     * them reduce pass efficiency), only up to 6 conv2d inputs per concat
+     * are supported. In addition, oneDNN concat with a large number of
+     * inputs performs worse than the CPU kernel.
+     */
+    /**
+     * fused_level 0: conv2d + activation
+     *             1: fused_conv2d + activation
+     */
+    for (size_t concat_num = 1; concat_num <= 6; concat_num++) {
+      for (auto activation : supported_activations_name) {
+        ps.Add(paddle::drr::Create<NConvConcatActivationFusePattern>(
+            context, concat_num, activation, 0, benefit++));
+        ps.Add(paddle::drr::Create<NConvConcatActivationFusePattern>(
+            context, concat_num, activation, 1, benefit++));
+      }
+    }
+
+    /**
+     * These activations use separate patterns to keep the benefit values
+     * from growing too large.
+     */
+    for (size_t concat_num = 1; concat_num <= 6; concat_num++) {
+      ps.Add(paddle::drr::Create<NConvConcatHardSigmoidFusePattern>(
+          context, concat_num, paddle::dialect::HardsigmoidOp::name(), 0));
+      ps.Add(paddle::drr::Create<NConvConcatHardSigmoidFusePattern>(
+          context, concat_num, paddle::dialect::HardsigmoidOp::name(), 1));
+    }
+
+    for (size_t concat_num = 1; concat_num <= 6; concat_num++) {
+      ps.Add(paddle::drr::Create<NConvConcatGeluFusePattern>(
+          context, concat_num, paddle::dialect::GeluOp::name(), 0));
+      ps.Add(paddle::drr::Create<NConvConcatGeluFusePattern>(
+          context, concat_num, paddle::dialect::GeluOp::name(), 1));
+    }
+
+    for (size_t concat_num = 1; concat_num <= 6; concat_num++) {
+      ps.Add(paddle::drr::Create<NConvConcatClipFusePattern>(
+          context, concat_num, paddle::dialect::ClipOp::name(), 0));
+      ps.Add(paddle::drr::Create<NConvConcatClipFusePattern>(
+          context, concat_num, paddle::dialect::ClipOp::name(), 1));
+    }
+
+    return ps;
+  }
+};
+
+}  // namespace
+
+namespace pir {
+
+std::unique_ptr<Pass> CreateConv2dConcatActFusePass() {
+  // This pass must execute before conv_activation_mkldnn_fuse_pass:
+  //
+  //   conv  conv ... conv            fused_conv  fused_conv ... fused_conv
+  //      \    |    /                       \          |          /
+  //         concat            ->                   concat
+  //           |
+  //          act
+  //
+  // i.e. the activation that follows the concat is folded into every conv
+  // feeding it, so only the concat remains afterwards.
+  return std::make_unique<ConvConcatActFusePass>();
+}
+
+}  // namespace pir
+
+REGISTER_IR_PASS(conv_concat_activation_onednn_fuse_pass,
+                 ConvConcatActFusePass);
diff --git a/paddle/fluid/pir/transforms/onednn/conv_concat_activation_onednn_fuse_pass.h b/paddle/fluid/pir/transforms/onednn/conv_concat_activation_onednn_fuse_pass.h
new file mode 100644
index 0000000000000..972d594569684
--- /dev/null
+++ b/paddle/fluid/pir/transforms/onednn/conv_concat_activation_onednn_fuse_pass.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
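+//
+// Illustrative scheduling sketch, assuming the usual pir::PassManager API
+// (only CreateConv2dConcatActFusePass below is declared by this header):
+//
+//   pir::PassManager pm(pir::IrContext::Instance());
+//   pm.AddPass(pir::CreateConv2dConcatActFusePass());  // must run first
+//   // ... per-conv conv + activation fusion passes afterwards ...
+//   pm.Run(&program);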
+ +#pragma once + +#include +#include "paddle/pir/include/core/dll_decl.h" + +namespace pir { + +class Pass; + +IR_API std::unique_ptr CreateConv2dConcatActFusePass(); + +} // namespace pir diff --git a/paddle/fluid/pir/transforms/onednn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/pir/transforms/onednn/conv_elementwise_add_onednn_fuse_pass.cc similarity index 99% rename from paddle/fluid/pir/transforms/onednn/conv_elementwise_add_mkldnn_fuse_pass.cc rename to paddle/fluid/pir/transforms/onednn/conv_elementwise_add_onednn_fuse_pass.cc index 4ecd752b85997..c367712927dcc 100644 --- a/paddle/fluid/pir/transforms/onednn/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/pir/transforms/onednn/conv_elementwise_add_onednn_fuse_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/pir/transforms/onednn/conv_elementwise_add_mkldnn_fuse_pass.h" +#include "paddle/fluid/pir/transforms/onednn/conv_elementwise_add_onednn_fuse_pass.h" #include "paddle/fluid/pir/dialect/operator/ir/onednn_op.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" @@ -385,7 +385,7 @@ class FusedConvBiasElementwiseAddAsYPattern class ConvElementwiseAddFusePass : public pir::PatternRewritePass { public: ConvElementwiseAddFusePass() - : pir::PatternRewritePass("conv_elementwise_add_mkldnn_fuse_pass", 3) {} + : pir::PatternRewritePass("conv_elementwise_add_onednn_fuse_pass", 3) {} pir::RewritePatternSet InitializePatterns(pir::IrContext *context) override { pir::RewritePatternSet ps(context); @@ -421,5 +421,5 @@ std::unique_ptr CreateConvElementwiseAddFusePass() { } // namespace pir -REGISTER_IR_PASS(conv_elementwise_add_mkldnn_fuse_pass, +REGISTER_IR_PASS(conv_elementwise_add_onednn_fuse_pass, ConvElementwiseAddFusePass); diff --git a/paddle/fluid/pir/transforms/onednn/conv_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/pir/transforms/onednn/conv_elementwise_add_onednn_fuse_pass.h similarity index 100% rename from paddle/fluid/pir/transforms/onednn/conv_elementwise_add_mkldnn_fuse_pass.h rename to paddle/fluid/pir/transforms/onednn/conv_elementwise_add_onednn_fuse_pass.h diff --git a/paddle/fluid/pir/transforms/onednn/depthwise_conv_onednn_pass.cc b/paddle/fluid/pir/transforms/onednn/depthwise_conv_onednn_pass.cc new file mode 100644 index 0000000000000..5b89ac9a1f0f7 --- /dev/null +++ b/paddle/fluid/pir/transforms/onednn/depthwise_conv_onednn_pass.cc @@ -0,0 +1,107 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
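+//
+// The RequireNativeCall guard in DepthwiseConvPattern below encodes the
+// preconditions under which the depthwise_conv2d -> conv2d rewrite is safe.
+// As a standalone sketch of the same check (function name hypothetical):
+//
+//   bool CanUseOneDNNConv(const std::string &padding_algorithm,
+//                         const std::string &data_format, int groups) {
+//     static const std::set<std::string> kPad = {"EXPLICIT", "SAME", "VALID"};
+//     static const std::set<std::string> kFmt = {"NCHW", "NHWC", "AnyLayout"};
+//     return kPad.count(padding_algorithm) && kFmt.count(data_format) &&
+//            groups >= 1;
+//   }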
+
+#include "paddle/fluid/pir/transforms/onednn/depthwise_conv_onednn_pass.h"
+
+#include "paddle/fluid/pir/dialect/operator/ir/onednn_op.h"
+#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
+#include "paddle/fluid/pir/drr/include/drr_pattern_base.h"
+
+#include "paddle/pir/include/pass/pass.h"
+#include "paddle/pir/include/pass/pass_registry.h"
+
+namespace {
+
+class DepthwiseConvPattern : public paddle::drr::DrrPatternBase {
+ private:
+  std::string depthwise_conv_name_;
+
+ public:
+  explicit DepthwiseConvPattern(const std::string &conv_name)
+      : depthwise_conv_name_(conv_name) {}
+
+  std::string name() const override { return "DepthwiseConvPattern"; }
+
+  uint32_t benefit() const override { return 2; }
+
+  void operator()(paddle::drr::DrrPatternContext *ctx) const override {
+    paddle::drr::SourcePattern pat = ctx->SourcePattern();
+
+    const auto &depthwise_conv =
+        pat.Op(depthwise_conv_name_,
+               {{"strides", pat.Attr("strides")},
+                {"paddings", pat.Attr("paddings")},
+                {"padding_algorithm", pat.Attr("padding_algorithm")},
+                {"dilations", pat.Attr("dilations")},
+                {"groups", pat.Attr("groups")},
+                {"data_format", pat.Attr("data_format")}});
+
+    depthwise_conv({&pat.Tensor("input"), &pat.Tensor("filter")},
+                   {&pat.Tensor("conv_out")});
+
+    pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) {
+      std::set<std::string> padding_algorithm = {"EXPLICIT", "SAME", "VALID"};
+      std::set<std::string> data_format = {"NCHW", "NHWC", "AnyLayout"};
+      if (padding_algorithm.count(
+              match_ctx.Attr<std::string>("padding_algorithm")) == 0 ||
+          data_format.count(match_ctx.Attr<std::string>("data_format")) ==
+              0 ||
+          match_ctx.Attr<int>("groups") < 1) {
+        return false;
+      }
+      return true;
+    });
+
+    paddle::drr::ResultPattern res = pat.ResultPattern();
+
+    const auto &conv2d =
+        res.Op(paddle::dialect::Conv2dOp::name(),
+               {{
+                   {"strides", pat.Attr("strides")},
+                   {"paddings", pat.Attr("paddings")},
+                   {"padding_algorithm", pat.Attr("padding_algorithm")},
+                   {"dilations", pat.Attr("dilations")},
+                   {"groups", pat.Attr("groups")},
+                   {"data_format", pat.Attr("data_format")},
+               }});
+
+    conv2d({&res.Tensor("input"), &res.Tensor("filter")},
+           {&res.Tensor("conv_out")});
+  }
+};
+
+class DepthwiseConvMKLDNNPass : public pir::PatternRewritePass {
+ public:
+  DepthwiseConvMKLDNNPass()
+      : pir::PatternRewritePass("depthwise_conv_onednn_pass", 2) {}
+
+  pir::RewritePatternSet InitializePatterns(pir::IrContext *context) override {
+    pir::RewritePatternSet ps(context);
+    ps.Add(paddle::drr::Create<DepthwiseConvPattern>(
+        context, paddle::dialect::DepthwiseConv2dOp::name()));
+    return ps;
+  }
+};
+
+}  // namespace
+
+namespace pir {
+
+std::unique_ptr<Pass> CreateDepthwiseConvMKLDNNPass() {
+  // pd_op.depthwise_conv -> pd_op.conv2d
+  return std::make_unique<DepthwiseConvMKLDNNPass>();
+}
+
+}  // namespace pir
+
+REGISTER_IR_PASS(depthwise_conv_onednn_pass, DepthwiseConvMKLDNNPass);
diff --git a/paddle/fluid/pir/transforms/onednn/depthwise_conv_onednn_pass.h b/paddle/fluid/pir/transforms/onednn/depthwise_conv_onednn_pass.h
new file mode 100644
index 0000000000000..9f91993ce8dbe
--- /dev/null
+++ b/paddle/fluid/pir/transforms/onednn/depthwise_conv_onednn_pass.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/pir/include/core/dll_decl.h" + +namespace pir { + +class Pass; + +IR_API std::unique_ptr CreateDepthwiseConvMKLDNNPass(); + +} // namespace pir diff --git a/paddle/fluid/pir/transforms/onednn/matmul_activation_fuse_pass.cc b/paddle/fluid/pir/transforms/onednn/matmul_activation_fuse_pass.cc index 1db28281578d4..45f182c955f16 100644 --- a/paddle/fluid/pir/transforms/onednn/matmul_activation_fuse_pass.cc +++ b/paddle/fluid/pir/transforms/onednn/matmul_activation_fuse_pass.cc @@ -92,16 +92,6 @@ class MatmulActivationFusePattern : public paddle::drr::DrrPatternBase { pat.Tensor("act_out") = act(pat.Tensor("Out")); - pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { - std::set bool_sets = {true, false}; - auto result_x = match_ctx.Attr("transpose_x"); - auto result_y = match_ctx.Attr("transpose_y"); - if (bool_sets.count(result_x) == 0 || bool_sets.count(result_y) == 0) { - return false; - } - return true; - }); - if (act_type_ == paddle::dialect::GeluOp::name()) { pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { auto result_gelu = match_ctx.Attr("approximate"); @@ -187,15 +177,6 @@ class MatmulGeluTanhFusePattern : public paddle::drr::DrrPatternBase { pat.Tensor("act_out") = act(pat.Tensor("Out")); - pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { - std::set bool_sets = {true, false}; - auto result_x = match_ctx.Attr("transpose_x"); - auto result_y = match_ctx.Attr("transpose_y"); - if (bool_sets.count(result_x) == 0 || bool_sets.count(result_y) == 0) { - return false; - } - return true; - }); pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { auto result_gelu = match_ctx.Attr("approximate"); if (!result_gelu) return false; @@ -272,16 +253,6 @@ class MatmulClipFusePattern : public paddle::drr::DrrPatternBase { pat.Tensor("act_out") = act(pat.Tensor("Out"), pat.Tensor("min"), pat.Tensor("max")); - pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { - std::set bool_sets = {true, false}; - auto result_x = match_ctx.Attr("transpose_x"); - auto result_y = match_ctx.Attr("transpose_y"); - if (bool_sets.count(result_x) == 0 || bool_sets.count(result_y) == 0) { - return false; - } - return true; - }); - paddle::drr::ResultPattern res = pat.ResultPattern(); std::unordered_map fused_attrs{ @@ -375,16 +346,11 @@ class FusedMatmulActivationFusePattern : public paddle::drr::DrrPatternBase { pat.Tensor("act_out") = act(pat.Tensor("Out")); pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { - std::set bool_sets = {true, false}; - auto result_x = match_ctx.Attr("transpose_x"); - auto result_y = match_ctx.Attr("transpose_y"); auto act_type = match_ctx.Attr("fuse_activation"); - if (bool_sets.count(result_x) == 0 || bool_sets.count(result_y) == 0 || - act_type != "") { - return false; - } + if (act_type != "") return false; return true; }); + if (act_type_ == paddle::dialect::GeluOp::name()) { pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { auto result_gelu = match_ctx.Attr("approximate"); @@ 
-490,16 +456,11 @@ class FusedMatmulGeluTanhFusePattern : public paddle::drr::DrrPatternBase { pat.Tensor("act_out") = act(pat.Tensor("Out")); pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { - std::set bool_sets = {true, false}; - auto result_x = match_ctx.Attr("transpose_x"); - auto result_y = match_ctx.Attr("transpose_y"); auto act_type = match_ctx.Attr("fuse_activation"); - if (bool_sets.count(result_x) == 0 || bool_sets.count(result_y) == 0 || - act_type != "") { - return false; - } + if (act_type != "") return false; return true; }); + pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { auto result_gelu = match_ctx.Attr("approximate"); if (!result_gelu) return false; @@ -597,14 +558,8 @@ class FusedMatmulClipFusePattern : public paddle::drr::DrrPatternBase { act(pat.Tensor("Out"), pat.Tensor("min"), pat.Tensor("max")); pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { - std::set bool_sets = {true, false}; - auto result_x = match_ctx.Attr("transpose_x"); - auto result_y = match_ctx.Attr("transpose_y"); auto act_type = match_ctx.Attr("fuse_activation"); - if (bool_sets.count(result_x) == 0 || bool_sets.count(result_y) == 0 || - act_type != "") { - return false; - } + if (act_type != "") return false; return true; }); @@ -645,7 +600,6 @@ class MatmulActivationFusePass : public pir::PatternRewritePass { pir::RewritePatternSet InitializePatterns(pir::IrContext *context) override { pir::RewritePatternSet ps(context); - // std::vector bool_set = {false, true}; int benefit_idx = 1; for (auto act_op : act_ops) { ps.Add(paddle::drr::Create( diff --git a/paddle/fluid/pir/transforms/onednn/matmul_elementwise_add_fuse_pass.cc b/paddle/fluid/pir/transforms/onednn/matmul_elementwise_add_fuse_pass.cc index e4ebc7d79378e..91ce0f80018c5 100644 --- a/paddle/fluid/pir/transforms/onednn/matmul_elementwise_add_fuse_pass.cc +++ b/paddle/fluid/pir/transforms/onednn/matmul_elementwise_add_fuse_pass.cc @@ -59,16 +59,6 @@ class MatmulElementwiseAddFusePattern : public paddle::drr::DrrPatternBase { as_x_ ? 
add(pat.Tensor("Out"), pat.Tensor("residual")) : add(pat.Tensor("residual"), pat.Tensor("Out")); - pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { - std::set bool_sets = {true, false}; - auto result_x = match_ctx.Attr("transpose_x"); - auto result_y = match_ctx.Attr("transpose_y"); - if (bool_sets.count(result_x) == 0 || bool_sets.count(result_y) == 0) { - return false; - } - return true; - }); - paddle::drr::ResultPattern res = pat.ResultPattern(); const auto &fused_matmul = @@ -106,20 +96,17 @@ class FusedMatmulElementwiseAddFusePattern std::string matmul_name_; std::string fused_matmul_name_; uint32_t benefit_; - bool as_x_; // Decide input direction of 1st add - bool as_x2_; // Decide input direction of 2nd add + bool as_x_; // Decide input direction of add public: FusedMatmulElementwiseAddFusePattern(const std::string &matmul_name, const std::string &fused_matmul_name, uint32_t benefit, - bool as_x, - bool as_x2) + bool as_x) : matmul_name_(matmul_name), fused_matmul_name_(fused_matmul_name), benefit_(benefit), - as_x_(as_x), - as_x2_(as_x2) {} + as_x_(as_x) {} std::string name() const override { return "FusedMatmulElementwiseAddFusePattern"; @@ -130,26 +117,39 @@ class FusedMatmulElementwiseAddFusePattern void operator()(paddle::drr::DrrPatternContext *ctx) const override { paddle::drr::SourcePattern pat = ctx->SourcePattern(); - const auto &matmul = pat.Op(matmul_name_, - {{"transpose_x", pat.Attr("transpose_x")}, - {"transpose_y", pat.Attr("transpose_y")}}); + const auto &matmul = + pat.Op(matmul_name_, + {{"trans_x", pat.Attr("transpose_x")}, + {"trans_y", pat.Attr("transpose_y")}, + {"matmul_alpha", pat.Attr("matmul_alpha")}, + {"fuse_activation", pat.Attr("fuse_activation")}, + {"fuse_alpha", pat.Attr("fuse_alpha")}, + {"fuse_beta", pat.Attr("fuse_beta")}, + {"fused_output_scale", pat.Attr("fused_output_scale")}, + {"fused_reshape_x", pat.Attr("fused_reshape_x")}, + {"fused_transpose_x", pat.Attr("fused_transpose_x")}, + {"fused_reshape_y", pat.Attr("fused_reshape_y")}, + {"fused_transpose_y", pat.Attr("fused_transpose_y")}, + {"fused_reshape_out", pat.Attr("fused_reshape_out")}, + {"fused_transpose_out", pat.Attr("fused_transpose_out")}, + {"mkldnn_data_type", pat.Attr("mkldnn_data_type")}, + {"scale_x", pat.Attr("scale_x")}, + {"scale_y", pat.Attr("scale_y")}, + {"scale_in_eltwise", pat.Attr("scale_in_eltwise")}, + {"scale_out", pat.Attr("scale_out")}, + {"force_fp32_output", pat.Attr("force_fp32_output")}}); const auto &add = pat.Op(paddle::dialect::AddOp::name()); - const auto &add2 = pat.Op(paddle::dialect::AddOp::name()); - matmul({&pat.Tensor("X"), &pat.Tensor("Y")}, {&pat.Tensor("Out")}); + matmul({&pat.Tensor("X"), &pat.Tensor("Y"), &pat.Tensor("none")}, + {&pat.Tensor("Out")}); pat.Tensor("add_out") = as_x_ ? add(pat.Tensor("Out"), pat.Tensor("residual")) : add(pat.Tensor("residual"), pat.Tensor("Out")); - pat.Tensor("add_out_end") = - as_x2_ ? 
add2(pat.Tensor("add_out"), pat.Tensor("residual2")) - : add2(pat.Tensor("residual2"), pat.Tensor("add_out")); pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { - std::set bool_sets = {true, false}; - auto result_x = match_ctx.Attr("transpose_x"); - auto result_y = match_ctx.Attr("transpose_y"); - if (bool_sets.count(result_x) == 0 || bool_sets.count(result_y) == 0) { + auto none_tensor = match_ctx.Tensor("none"); + if (none_tensor.impl() != nullptr) { return false; } return true; @@ -157,36 +157,32 @@ class FusedMatmulElementwiseAddFusePattern paddle::drr::ResultPattern res = pat.ResultPattern(); - const auto &fused_add = res.Op(paddle::dialect::AddOp::name()); - res.Tensor("residual3") = - fused_add(res.Tensor("residual1"), res.Tensor("residual2")); - const auto &fused_matmul = res.Op(fused_matmul_name_, {{ {"trans_x", pat.Attr("transpose_x")}, {"trans_y", pat.Attr("transpose_y")}, - {"matmul_alpha", res.Float32Attr(1.0f)}, - {"fuse_activation", res.StrAttr("")}, - {"fuse_alpha", res.Float32Attr(0.0f)}, - {"fuse_beta", res.Float32Attr(0.0f)}, - {"fused_output_scale", res.Float32Attr(1.0f)}, - {"fused_reshape_x", res.VectorInt32Attr({})}, - {"fused_transpose_x", res.VectorInt32Attr({})}, - {"fused_reshape_y", res.VectorInt32Attr({})}, - {"fused_transpose_y", res.VectorInt32Attr({})}, - {"fused_reshape_out", res.VectorInt32Attr({})}, - {"fused_transpose_out", res.VectorInt32Attr({})}, - {"mkldnn_data_type", res.StrAttr("float32")}, - {"scale_x", res.Float32Attr(1.0f)}, - {"scale_y", res.Float32Attr(1.0f)}, - {"scale_in_eltwise", res.Float32Attr(0.0f)}, - {"scale_out", res.Float32Attr(1.0f)}, - {"force_fp32_output", res.BoolAttr(false)}, + {"matmul_alpha", pat.Attr("matmul_alpha")}, + {"fuse_activation", pat.Attr("fuse_activation")}, + {"fuse_alpha", pat.Attr("fuse_alpha")}, + {"fuse_beta", pat.Attr("fuse_beta")}, + {"fused_output_scale", pat.Attr("fused_output_scale")}, + {"fused_reshape_x", pat.Attr("fused_reshape_x")}, + {"fused_transpose_x", pat.Attr("fused_transpose_x")}, + {"fused_reshape_y", pat.Attr("fused_reshape_y")}, + {"fused_transpose_y", pat.Attr("fused_transpose_y")}, + {"fused_reshape_out", pat.Attr("fused_reshape_out")}, + {"fused_transpose_out", pat.Attr("fused_transpose_out")}, + {"mkldnn_data_type", pat.Attr("mkldnn_data_type")}, + {"scale_x", pat.Attr("scale_x")}, + {"scale_y", pat.Attr("scale_y")}, + {"scale_in_eltwise", pat.Attr("scale_in_eltwise")}, + {"scale_out", pat.Attr("scale_out")}, + {"force_fp32_output", pat.Attr("force_fp32_output")}, }}); - fused_matmul({&res.Tensor("X"), &res.Tensor("Y"), &res.Tensor("residual3")}, - {&res.Tensor("add_out_end")}); + fused_matmul({&res.Tensor("X"), &res.Tensor("Y"), &res.Tensor("residual")}, + {&res.Tensor("add_out")}); } }; @@ -209,17 +205,15 @@ class MatmulElementwiseAddFusePass : public pir::PatternRewritePass { benefit_idx++; } - for (auto as_x : bool_set) - for (auto as_x2 : bool_set) { - ps.Add(paddle::drr::Create( - context, - paddle::dialect::MatmulOp::name(), - paddle::onednn::dialect::FusedMatmulOp::name(), - benefit_idx, - as_x, - as_x2)); - benefit_idx++; - } + for (auto as_x : bool_set) { + ps.Add(paddle::drr::Create( + context, + paddle::onednn::dialect::FusedMatmulOp::name(), + paddle::onednn::dialect::FusedMatmulOp::name(), + benefit_idx, + as_x)); + benefit_idx++; + } return ps; } }; @@ -230,8 +224,7 @@ namespace pir { std::unique_ptr CreateMatmulElementwiseAddFusePass() { // pd_op.matmul + pd_op.add -> onednn_op.fused_matmul - // pd_op.matmul + pd_op.add + pd_op.add -> pd_op.add + 
onednn_op.fused_matmul - // -> onednn_op.fused_matmul + // onednn_op.fused_matmul + pd_op.add -> onednn_op.fused_matmul return std::make_unique(); } } // namespace pir diff --git a/paddle/fluid/pir/transforms/onednn/matmul_transpose_reshape_fuse_pass.cc b/paddle/fluid/pir/transforms/onednn/matmul_transpose_reshape_fuse_pass.cc new file mode 100644 index 0000000000000..246cde678593c --- /dev/null +++ b/paddle/fluid/pir/transforms/onednn/matmul_transpose_reshape_fuse_pass.cc @@ -0,0 +1,271 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/pir/transforms/onednn/matmul_transpose_reshape_fuse_pass.h" + +#include "paddle/fluid/pir/dialect/operator/ir/onednn_op.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/fluid/pir/drr/include/drr_pattern_base.h" + +#include "paddle/pir/include/pass/pass.h" +#include "paddle/pir/include/pass/pass_registry.h" + +namespace { +class MatmulTransposeReshapeFusePattern : public paddle::drr::DrrPatternBase { + private: + std::string matmul_name_; + std::string fused_matmul_name_; + uint32_t benefit_; + + public: + MatmulTransposeReshapeFusePattern(const std::string &matmul_name, + const std::string &fused_matmul_name, + uint32_t benefit) + : matmul_name_(matmul_name), + fused_matmul_name_(fused_matmul_name), + benefit_(benefit) {} + + std::string name() const override { + return "MatmulTransposeReshapeFusePattern"; + } + + uint32_t benefit() const override { return benefit_; } + + void operator()(paddle::drr::DrrPatternContext *ctx) const override { + paddle::drr::SourcePattern pat = ctx->SourcePattern(); + + const auto &matmul = pat.Op(matmul_name_, + {{"transpose_x", pat.Attr("transpose_x")}, + {"transpose_y", pat.Attr("transpose_y")}}); + matmul({&pat.Tensor("X"), &pat.Tensor("Y")}, {&pat.Tensor("Out")}); + + const auto &transpose = pat.Op(paddle::dialect::TransposeOp::name(), + {{"perm", pat.Attr("perm")}}); + pat.Tensor("transpose_out") = transpose(pat.Tensor("Out")); + + const auto &full_int_array = pat.Op(paddle::dialect::FullIntArrayOp::name(), + {{"value", pat.Attr("int_array")}}); + pat.Tensor("shape") = full_int_array(); + + const auto &reshape = pat.Op(paddle::dialect::ReshapeOp::name()); + reshape({&pat.Tensor("transpose_out"), &pat.Tensor("shape")}, + {&pat.Tensor("reshape_out"), &pat.Tensor("Xshape")}); + + pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { + auto shape = match_ctx.Attr>("int_array"); + auto perm = match_ctx.Attr>("perm"); + const std::vector supported_axis{0, 2, 1, 3}; + if (perm != supported_axis) return false; + if (shape.size() != 3) return false; + if (std::count(shape.begin(), shape.end(), -1) > 1) return false; + return true; + }); + + paddle::drr::ResultPattern res = pat.ResultPattern(); + + std::unordered_map fused_attrs{ + {"trans_x", pat.Attr("transpose_x")}, + {"trans_y", pat.Attr("transpose_y")}, + {"matmul_alpha", res.Float32Attr(1.0f)}, + {"fuse_activation", 
res.StrAttr("")}, + {"fuse_alpha", res.Float32Attr(0.0f)}, + {"fuse_beta", res.Float32Attr(0.0f)}, + {"fused_output_scale", res.Float32Attr(1.0f)}, + {"fused_reshape_x", res.VectorInt32Attr({})}, + {"fused_transpose_x", res.VectorInt32Attr({})}, + {"fused_reshape_y", res.VectorInt32Attr({})}, + {"fused_transpose_y", res.VectorInt32Attr({})}, + {"mkldnn_data_type", res.StrAttr("float32")}, + {"scale_x", res.Float32Attr(1.0f)}, + {"scale_y", res.Float32Attr(1.0f)}, + {"scale_in_eltwise", res.Float32Attr(0.0f)}, + {"scale_out", res.Float32Attr(1.0f)}, + {"force_fp32_output", res.BoolAttr(false)}}; + + const auto &fused_reshape_attr = res.ComputeAttr( + [](const paddle::drr::MatchContext &match_ctx) -> std::vector { + std::vector int_array_value; + auto shape = match_ctx.Attr>("int_array"); + for (auto i : shape) { + int_array_value.emplace_back(static_cast(i)); + } + return int_array_value; + }); + + fused_attrs.emplace("fused_reshape_out", fused_reshape_attr); + fused_attrs.emplace("fused_transpose_out", pat.Attr("perm")); + + const auto &fused_matmul = res.Op(fused_matmul_name_, fused_attrs); + + fused_matmul({&res.Tensor("X"), &res.Tensor("Y"), &res.InputNoneTensor()}, + {&res.Tensor("reshape_out")}); + } +}; + +class FusedMatmulTransposeReshapeFusePattern + : public paddle::drr::DrrPatternBase { + private: + std::string matmul_name_; + std::string fused_matmul_name_; + uint32_t benefit_; + + public: + FusedMatmulTransposeReshapeFusePattern(const std::string &matmul_name, + const std::string &fused_matmul_name, + uint32_t benefit) + : matmul_name_(matmul_name), + fused_matmul_name_(fused_matmul_name), + benefit_(benefit) {} + + std::string name() const override { + return "FusedMatmulTransposeReshapeFusePattern"; + } + + uint32_t benefit() const override { return benefit_; } + + void operator()(paddle::drr::DrrPatternContext *ctx) const override { + paddle::drr::SourcePattern pat = ctx->SourcePattern(); + + const auto &matmul = + pat.Op(matmul_name_, + {{"trans_x", pat.Attr("transpose_x")}, + {"trans_y", pat.Attr("transpose_y")}, + {"matmul_alpha", pat.Attr("matmul_alpha")}, + {"fuse_activation", pat.Attr("fuse_activation")}, + {"fuse_alpha", pat.Attr("fuse_alpha")}, + {"fuse_beta", pat.Attr("fuse_beta")}, + {"fused_output_scale", pat.Attr("fused_output_scale")}, + {"fused_reshape_x", pat.Attr("fused_reshape_x")}, + {"fused_transpose_x", pat.Attr("fused_transpose_x")}, + {"fused_reshape_y", pat.Attr("fused_reshape_y")}, + {"fused_transpose_y", pat.Attr("fused_transpose_y")}, + {"fused_reshape_out", pat.Attr("fused_reshape_out")}, + {"fused_transpose_out", pat.Attr("fused_transpose_out")}, + {"mkldnn_data_type", pat.Attr("mkldnn_data_type")}, + {"scale_x", pat.Attr("scale_x")}, + {"scale_y", pat.Attr("scale_y")}, + {"scale_in_eltwise", pat.Attr("scale_in_eltwise")}, + {"scale_out", pat.Attr("scale_out")}, + {"force_fp32_output", pat.Attr("force_fp32_output")}}); + + matmul({&pat.Tensor("X"), &pat.Tensor("Y"), &pat.Tensor("residual")}, + {&pat.Tensor("Out")}); + + const auto &transpose = pat.Op(paddle::dialect::TransposeOp::name(), + {{"perm", pat.Attr("perm")}}); + pat.Tensor("transpose_out") = transpose(pat.Tensor("Out")); + + const auto &full_int_array = pat.Op(paddle::dialect::FullIntArrayOp::name(), + {{"value", pat.Attr("int_array")}}); + pat.Tensor("shape") = full_int_array(); + + const auto &reshape = pat.Op(paddle::dialect::ReshapeOp::name()); + reshape({&pat.Tensor("transpose_out"), &pat.Tensor("shape")}, + {&pat.Tensor("reshape_out"), &pat.Tensor("Xshape")}); + + 
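+    // Matched chain, with illustrative shapes: a [B, H, S, D] fused_matmul
+    // output is transposed with perm = {0, 2, 1, 3} to [B, S, H, D] and then
+    // reshaped to a 3-D shape such as {B, S, H * D} (at most one entry may
+    // be -1). The guards below reject every other perm/shape combination and
+    // bail out if the fused_matmul already carries a fused output reshape.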
pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { + auto shape = match_ctx.Attr>("int_array"); + auto perm = match_ctx.Attr>("perm"); + const std::vector supported_axis{0, 2, 1, 3}; + if (perm != supported_axis) return false; + if (shape.size() != 3) return false; + if (std::count(shape.begin(), shape.end(), -1) > 1) return false; + + return true; + }); + + pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { + if (!(match_ctx.Attr>("fused_reshape_out").empty())) + return false; + return true; + }); + + paddle::drr::ResultPattern res = pat.ResultPattern(); + + std::unordered_map fused_attrs{ + {"trans_x", pat.Attr("transpose_x")}, + {"trans_y", pat.Attr("transpose_y")}, + {"matmul_alpha", pat.Attr("matmul_alpha")}, + {"fuse_activation", pat.Attr("fuse_activation")}, + {"fuse_alpha", pat.Attr("fuse_alpha")}, + {"fuse_beta", pat.Attr("fuse_beta")}, + {"fused_output_scale", pat.Attr("fused_output_scale")}, + {"fused_reshape_x", pat.Attr("fused_reshape_x")}, + {"fused_transpose_x", pat.Attr("fused_transpose_x")}, + {"fused_reshape_y", pat.Attr("fused_reshape_y")}, + {"fused_transpose_y", pat.Attr("fused_transpose_y")}, + {"mkldnn_data_type", pat.Attr("mkldnn_data_type")}, + {"scale_x", pat.Attr("scale_x")}, + {"scale_y", pat.Attr("scale_y")}, + {"scale_in_eltwise", pat.Attr("scale_in_eltwise")}, + {"scale_out", pat.Attr("scale_out")}, + {"force_fp32_output", pat.Attr("force_fp32_output")}}; + + const auto &fused_reshape_attr = res.ComputeAttr( + [](const paddle::drr::MatchContext &match_ctx) -> std::vector { + std::vector int_array_value; + auto shape = match_ctx.Attr>("int_array"); + for (auto i : shape) { + int_array_value.emplace_back(static_cast(i)); + } + return int_array_value; + }); + + fused_attrs.emplace("fused_reshape_out", fused_reshape_attr); + fused_attrs.emplace("fused_transpose_out", pat.Attr("perm")); + + const auto &fused_matmul = res.Op(fused_matmul_name_, fused_attrs); + + fused_matmul({&res.Tensor("X"), &res.Tensor("Y"), &res.Tensor("residual")}, + {&res.Tensor("reshape_out")}); + } +}; + +class MatmulTransposeReshapeFusePass : public pir::PatternRewritePass { + public: + MatmulTransposeReshapeFusePass() + : pir::PatternRewritePass("matmul_transpose_reshape_fuse_pass", 3) {} + + pir::RewritePatternSet InitializePatterns(pir::IrContext *context) override { + pir::RewritePatternSet ps(context); + int benefit_idx = 1; + ps.Add(paddle::drr::Create( + context, + paddle::dialect::MatmulOp::name(), + paddle::onednn::dialect::FusedMatmulOp::name(), + benefit_idx++)); + + ps.Add(paddle::drr::Create( + context, + paddle::onednn::dialect::FusedMatmulOp::name(), + paddle::onednn::dialect::FusedMatmulOp::name(), + benefit_idx++)); + return ps; + } +}; + +} // namespace + +namespace pir { + +std::unique_ptr CreateMatmulTransposeReshapeFusePass() { + // pd_op.matmul + pd_op.transpose + pd_op.reshape -> onednn_op.fused_matmul + // pd_op.fused_matmul + pd_op.transpose + pd_op.reshape -> + // onednn_op.fused_matmul + return std::make_unique(); +} +} // namespace pir + +REGISTER_IR_PASS(matmul_transpose_reshape_fuse_pass, + MatmulTransposeReshapeFusePass); diff --git a/paddle/fluid/pir/transforms/onednn/matmul_transpose_reshape_fuse_pass.h b/paddle/fluid/pir/transforms/onednn/matmul_transpose_reshape_fuse_pass.h new file mode 100644 index 0000000000000..c56fa7ee62d7a --- /dev/null +++ b/paddle/fluid/pir/transforms/onednn/matmul_transpose_reshape_fuse_pass.h @@ -0,0 +1,26 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/pir/include/core/dll_decl.h" + +namespace pir { + +class Pass; + +IR_API std::unique_ptr CreateMatmulTransposeReshapeFusePass(); + +} // namespace pir diff --git a/paddle/fluid/pir/transforms/onednn/reshape_transpose_matmul_fuse_pass.cc b/paddle/fluid/pir/transforms/onednn/reshape_transpose_matmul_fuse_pass.cc new file mode 100644 index 0000000000000..d249a2174ed88 --- /dev/null +++ b/paddle/fluid/pir/transforms/onednn/reshape_transpose_matmul_fuse_pass.cc @@ -0,0 +1,335 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/pir/transforms/onednn/reshape_transpose_matmul_fuse_pass.h" + +#include "paddle/fluid/pir/dialect/operator/ir/onednn_op.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/fluid/pir/drr/include/drr_pattern_base.h" + +#include "paddle/pir/include/pass/pass.h" +#include "paddle/pir/include/pass/pass_registry.h" + +namespace { +class ReshapeTransposeMatmulFusePattern : public paddle::drr::DrrPatternBase { + private: + std::string matmul_name_; + std::string fused_matmul_name_; + uint32_t benefit_; + bool as_x_; // decide if the output of transpose is for input_x of matmul + + public: + ReshapeTransposeMatmulFusePattern(const std::string &matmul_name, + const std::string &fused_matmul_name, + uint32_t benefit, + bool as_x) + : matmul_name_(matmul_name), + fused_matmul_name_(fused_matmul_name), + benefit_(benefit), + as_x_(as_x) {} + + std::string name() const override { + return "ReshapeTransposeMatmulFusePattern"; + } + + uint32_t benefit() const override { return benefit_; } + + void operator()(paddle::drr::DrrPatternContext *ctx) const override { + paddle::drr::SourcePattern pat = ctx->SourcePattern(); + + const auto &full_int_array = pat.Op(paddle::dialect::FullIntArrayOp::name(), + {{"value", pat.Attr("int_array")}}); + pat.Tensor("shape") = full_int_array(); + + const auto &reshape = pat.Op(paddle::dialect::ReshapeOp::name()); + reshape({&pat.Tensor("reshape_in"), &pat.Tensor("shape")}, + {&pat.Tensor("reshape_out"), &pat.Tensor("Xshape")}); + + const auto &transpose = pat.Op(paddle::dialect::TransposeOp::name(), + {{"perm", pat.Attr("perm")}}); + pat.Tensor("transpose_out") = transpose(pat.Tensor("reshape_out")); + + const auto &matmul = pat.Op(matmul_name_, + {{"transpose_x", pat.Attr("transpose_x")}, + {"transpose_y", 
pat.Attr("transpose_y")}}); + if (as_x_) { + matmul({&pat.Tensor("transpose_out"), &pat.Tensor("other")}, + {&pat.Tensor("Out")}); + } else { + matmul({&pat.Tensor("other"), &pat.Tensor("transpose_out")}, + {&pat.Tensor("Out")}); + } + + pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { + auto shape = match_ctx.Attr>("int_array"); + auto perm = match_ctx.Attr>("perm"); + if (shape.size() < 2 || shape.size() > 4) return false; + if (shape.size() != perm.size()) return false; + if (std::count(shape.begin(), shape.end(), -1) > 1) return false; + return true; + }); + + paddle::drr::ResultPattern res = pat.ResultPattern(); + + std::unordered_map fused_attrs{ + {"trans_x", pat.Attr("transpose_x")}, + {"trans_y", pat.Attr("transpose_y")}, + {"matmul_alpha", res.Float32Attr(1.0f)}, + {"fuse_activation", res.StrAttr("")}, + {"fuse_alpha", res.Float32Attr(0.0f)}, + {"fuse_beta", res.Float32Attr(0.0f)}, + {"fused_output_scale", res.Float32Attr(1.0f)}, + {"fused_reshape_out", res.VectorInt32Attr({})}, + {"fused_transpose_out", res.VectorInt32Attr({})}, + {"mkldnn_data_type", res.StrAttr("float32")}, + {"scale_x", res.Float32Attr(1.0f)}, + {"scale_y", res.Float32Attr(1.0f)}, + {"scale_in_eltwise", res.Float32Attr(0.0f)}, + {"scale_out", res.Float32Attr(1.0f)}, + {"force_fp32_output", res.BoolAttr(false)}}; + + const auto &fused_reshape_attr = res.ComputeAttr( + [](const paddle::drr::MatchContext &match_ctx) -> std::vector { + std::vector int_array_value; + auto shape = match_ctx.Attr>("int_array"); + for (auto i : shape) { + int_array_value.emplace_back(static_cast(i)); + } + return int_array_value; + }); + + if (as_x_) { + fused_attrs.emplace("fused_reshape_x", fused_reshape_attr); + fused_attrs.emplace("fused_transpose_x", pat.Attr("perm")); + fused_attrs.emplace("fused_reshape_y", res.VectorInt32Attr({})); + fused_attrs.emplace("fused_transpose_y", res.VectorInt32Attr({})); + } else { + fused_attrs.emplace("fused_reshape_x", res.VectorInt32Attr({})); + fused_attrs.emplace("fused_transpose_x", res.VectorInt32Attr({})); + fused_attrs.emplace("fused_reshape_y", fused_reshape_attr); + fused_attrs.emplace("fused_transpose_y", pat.Attr("perm")); + } + + const auto &fused_matmul = res.Op(fused_matmul_name_, fused_attrs); + + if (as_x_) { + fused_matmul({&res.Tensor("reshape_in"), + &res.Tensor("other"), + &res.InputNoneTensor()}, + {&res.Tensor("Out")}); + } else { + fused_matmul({&res.Tensor("other"), + &res.Tensor("reshape_in"), + &res.InputNoneTensor()}, + {&res.Tensor("Out")}); + } + } +}; + +class ReshapeTransposeFusedMatmulFusePattern + : public paddle::drr::DrrPatternBase { + private: + std::string matmul_name_; + std::string fused_matmul_name_; + uint32_t benefit_; + bool as_x_; // decide if the output of transpose is for input_x of matmul + + public: + ReshapeTransposeFusedMatmulFusePattern(const std::string &matmul_name, + const std::string &fused_matmul_name, + uint32_t benefit, + bool as_x) + : matmul_name_(matmul_name), + fused_matmul_name_(fused_matmul_name), + benefit_(benefit), + as_x_(as_x) {} + + std::string name() const override { + return "ReshapeTransposFusedMatmulFusePattern"; + } + + uint32_t benefit() const override { return benefit_; } + + void operator()(paddle::drr::DrrPatternContext *ctx) const override { + paddle::drr::SourcePattern pat = ctx->SourcePattern(); + + const auto &full_int_array = pat.Op(paddle::dialect::FullIntArrayOp::name(), + {{"value", pat.Attr("int_array")}}); + pat.Tensor("shape") = full_int_array(); + + const auto &reshape = 
pat.Op(paddle::dialect::ReshapeOp::name()); + reshape({&pat.Tensor("reshape_in"), &pat.Tensor("shape")}, + {&pat.Tensor("reshape_out"), &pat.Tensor("Xshape")}); + + const auto &transpose = pat.Op(paddle::dialect::TransposeOp::name(), + {{"perm", pat.Attr("perm")}}); + pat.Tensor("transpose_out") = transpose(pat.Tensor("reshape_out")); + + const auto &matmul = + pat.Op(matmul_name_, + {{"trans_x", pat.Attr("transpose_x")}, + {"trans_y", pat.Attr("transpose_y")}, + {"matmul_alpha", pat.Attr("matmul_alpha")}, + {"fuse_activation", pat.Attr("fuse_activation")}, + {"fuse_alpha", pat.Attr("fuse_alpha")}, + {"fuse_beta", pat.Attr("fuse_beta")}, + {"fused_output_scale", pat.Attr("fused_output_scale")}, + {"fused_reshape_x", pat.Attr("fused_reshape_x")}, + {"fused_transpose_x", pat.Attr("fused_transpose_x")}, + {"fused_reshape_y", pat.Attr("fused_reshape_y")}, + {"fused_transpose_y", pat.Attr("fused_transpose_y")}, + {"fused_reshape_out", pat.Attr("fused_reshape_out")}, + {"fused_transpose_out", pat.Attr("fused_transpose_out")}, + {"mkldnn_data_type", pat.Attr("mkldnn_data_type")}, + {"scale_x", pat.Attr("scale_x")}, + {"scale_y", pat.Attr("scale_y")}, + {"scale_in_eltwise", pat.Attr("scale_in_eltwise")}, + {"scale_out", pat.Attr("scale_out")}, + {"force_fp32_output", pat.Attr("force_fp32_output")}}); + if (as_x_) { + matmul({&pat.Tensor("transpose_out"), + &pat.Tensor("other"), + &pat.Tensor("residual")}, + {&pat.Tensor("Out")}); + } else { + matmul({&pat.Tensor("other"), + &pat.Tensor("transpose_out"), + &pat.Tensor("residual")}, + {&pat.Tensor("Out")}); + } + + pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { + auto shape = match_ctx.Attr>("int_array"); + auto perm = match_ctx.Attr>("perm"); + if (shape.size() < 2 || shape.size() > 4) return false; + if (shape.size() != perm.size()) return false; + if (std::count(shape.begin(), shape.end(), -1) > 1) return false; + + return true; + }); + + pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { + if (as_x_) { + if (!(match_ctx.Attr>("fused_reshape_x").empty())) + return false; + } else { + if (!(match_ctx.Attr>("fused_reshape_y").empty())) + return false; + } + return true; + }); + + paddle::drr::ResultPattern res = pat.ResultPattern(); + + std::unordered_map fused_attrs{ + {"trans_x", pat.Attr("transpose_x")}, + {"trans_y", pat.Attr("transpose_y")}, + {"matmul_alpha", pat.Attr("matmul_alpha")}, + {"fuse_activation", pat.Attr("fuse_activation")}, + {"fuse_alpha", pat.Attr("fuse_alpha")}, + {"fuse_beta", pat.Attr("fuse_beta")}, + {"fused_output_scale", pat.Attr("fused_output_scale")}, + {"fused_reshape_out", pat.Attr("fused_reshape_out")}, + {"fused_transpose_out", pat.Attr("fused_transpose_out")}, + {"mkldnn_data_type", pat.Attr("mkldnn_data_type")}, + {"scale_x", pat.Attr("scale_x")}, + {"scale_y", pat.Attr("scale_y")}, + {"scale_in_eltwise", pat.Attr("scale_in_eltwise")}, + {"scale_out", pat.Attr("scale_out")}, + {"force_fp32_output", pat.Attr("force_fp32_output")}}; + + const auto &fused_reshape_attr = res.ComputeAttr( + [](const paddle::drr::MatchContext &match_ctx) -> std::vector { + std::vector int_array_value; + auto shape = match_ctx.Attr>("int_array"); + for (auto i : shape) { + int_array_value.emplace_back(static_cast(i)); + } + return int_array_value; + }); + + if (as_x_) { + fused_attrs.emplace("fused_reshape_x", fused_reshape_attr); + fused_attrs.emplace("fused_transpose_x", pat.Attr("perm")); + fused_attrs.emplace("fused_reshape_y", pat.Attr("fused_reshape_y")); + 
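+      // Note: only the X side receives the newly folded reshape/transpose in
+      // this branch; the Y-side attributes are carried over unchanged from
+      // the existing fused_matmul (the else branch mirrors this for Y).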
fused_attrs.emplace("fused_transpose_y", pat.Attr("fused_transpose_y")); + } else { + fused_attrs.emplace("fused_reshape_x", pat.Attr("fused_reshape_x")); + fused_attrs.emplace("fused_transpose_x", pat.Attr("fused_transpose_x")); + fused_attrs.emplace("fused_reshape_y", fused_reshape_attr); + fused_attrs.emplace("fused_transpose_y", pat.Attr("perm")); + } + + const auto &fused_matmul = res.Op(fused_matmul_name_, fused_attrs); + + if (as_x_) { + fused_matmul({&res.Tensor("reshape_in"), + &res.Tensor("other"), + &res.Tensor("residual")}, + {&res.Tensor("Out")}); + } else { + fused_matmul({&res.Tensor("other"), + &res.Tensor("reshape_in"), + &res.Tensor("residual")}, + {&res.Tensor("Out")}); + } + } +}; + +class ReshapeTransposeMatmulFusePass : public pir::PatternRewritePass { + public: + ReshapeTransposeMatmulFusePass() + : pir::PatternRewritePass("reshape_transpose_matmul_fuse_pass", 3) {} + + pir::RewritePatternSet InitializePatterns(pir::IrContext *context) override { + pir::RewritePatternSet ps(context); + std::vector bool_set = {false, true}; + int benefit_idx = 5; + for (auto as_x : bool_set) { + ps.Add(paddle::drr::Create( + context, + paddle::dialect::MatmulOp::name(), + paddle::onednn::dialect::FusedMatmulOp::name(), + benefit_idx, + as_x)); + benefit_idx--; + } + + for (auto as_x : bool_set) { + ps.Add(paddle::drr::Create( + context, + paddle::onednn::dialect::FusedMatmulOp::name(), + paddle::onednn::dialect::FusedMatmulOp::name(), + benefit_idx, + as_x)); + benefit_idx--; + } + return ps; + } +}; + +} // namespace + +namespace pir { + +std::unique_ptr CreateReshapeTransposeMatmulFusePass() { + // pd_op.reshape + pd_op.transpose + pd_op.matmul -> onednn_op.fused_matmul + // pd_op.reshape + pd_op.transpose + pd_op.fused_matmul -> + // onednn_op.fused_matmul + return std::make_unique(); +} +} // namespace pir + +REGISTER_IR_PASS(reshape_transpose_matmul_fuse_pass, + ReshapeTransposeMatmulFusePass); diff --git a/paddle/fluid/pir/transforms/onednn/reshape_transpose_matmul_fuse_pass.h b/paddle/fluid/pir/transforms/onednn/reshape_transpose_matmul_fuse_pass.h new file mode 100644 index 0000000000000..71b5fe47f034b --- /dev/null +++ b/paddle/fluid/pir/transforms/onednn/reshape_transpose_matmul_fuse_pass.h @@ -0,0 +1,26 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/pir/include/core/dll_decl.h" + +namespace pir { + +class Pass; + +IR_API std::unique_ptr CreateReshapeTransposeMatmulFusePass(); + +} // namespace pir diff --git a/paddle/fluid/pir/transforms/onednn/scale_matmul_fuse_pass.cc b/paddle/fluid/pir/transforms/onednn/scale_matmul_fuse_pass.cc new file mode 100644 index 0000000000000..07a26a6beee34 --- /dev/null +++ b/paddle/fluid/pir/transforms/onednn/scale_matmul_fuse_pass.cc @@ -0,0 +1,296 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/pir/transforms/onednn/scale_matmul_fuse_pass.h" + +#include "paddle/fluid/pir/dialect/operator/ir/onednn_op.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/fluid/pir/drr/include/drr_pattern_base.h" + +#include "paddle/pir/include/pass/pass.h" +#include "paddle/pir/include/pass/pass_registry.h" + +namespace { +class ScaleMatmulFusePattern : public paddle::drr::DrrPatternBase { + private: + std::string matmul_name_; + std::string fused_matmul_name_; + uint32_t benefit_; + bool as_x_; // decide if the output of scale is for input_x of matmul + + public: + ScaleMatmulFusePattern(const std::string &matmul_name, + const std::string &fused_matmul_name, + uint32_t benefit, + bool as_x) + : matmul_name_(matmul_name), + fused_matmul_name_(fused_matmul_name), + benefit_(benefit), + as_x_(as_x) {} + + std::string name() const override { return "ScaleMatmulFusePattern"; } + + uint32_t benefit() const override { return benefit_; } + + void operator()(paddle::drr::DrrPatternContext *ctx) const override { + paddle::drr::SourcePattern pat = ctx->SourcePattern(); + + const auto &full = pat.Op(paddle::dialect::FullOp::name(), + {{"value", pat.Attr("scale_")}}); + pat.Tensor("scale") = full(); + + const auto &scale = + pat.Op(paddle::dialect::ScaleOp::name(), + {{"bias", pat.Attr("bias")}, + {"bias_after_scale", pat.Attr("bias_after_scale")}}); + scale({&pat.Tensor("scale_in"), &pat.Tensor("scale")}, + {&pat.Tensor("scale_out")}); + + const auto &matmul = pat.Op(matmul_name_, + {{"transpose_x", pat.Attr("transpose_x")}, + {"transpose_y", pat.Attr("transpose_y")}}); + if (as_x_) { + matmul({&pat.Tensor("scale_out"), &pat.Tensor("other")}, + {&pat.Tensor("Out")}); + } else { + matmul({&pat.Tensor("other"), &pat.Tensor("scale_out")}, + {&pat.Tensor("Out")}); + } + + pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { + auto scale = match_ctx.Attr("scale_"); + auto bias = match_ctx.Attr("bias"); + // conditions align with fluid pass + if (bias != 0.0f) return false; + if (scale <= 0.0f) return false; + return true; + }); + + paddle::drr::ResultPattern res = pat.ResultPattern(); + + std::unordered_map fused_attrs{ + {"trans_x", pat.Attr("transpose_x")}, + {"trans_y", pat.Attr("transpose_y")}, + {"fuse_activation", res.StrAttr("")}, + {"fuse_alpha", res.Float32Attr(0.0f)}, + {"fuse_beta", res.Float32Attr(0.0f)}, + {"fused_reshape_x", res.VectorInt32Attr({})}, + {"fused_transpose_x", res.VectorInt32Attr({})}, + {"fused_reshape_y", res.VectorInt32Attr({})}, + {"fused_transpose_y", res.VectorInt32Attr({})}, + {"fused_output_scale", res.Float32Attr(1.0f)}, + {"fused_reshape_out", res.VectorInt32Attr({})}, + {"fused_transpose_out", res.VectorInt32Attr({})}, + {"mkldnn_data_type", res.StrAttr("float32")}, + {"scale_x", res.Float32Attr(1.0f)}, + {"scale_y", res.Float32Attr(1.0f)}, + {"scale_in_eltwise", res.Float32Attr(0.0f)}, + {"scale_out", res.Float32Attr(1.0f)}, + {"force_fp32_output", 
res.BoolAttr(false)}}; + + const auto &matmul_alpha_attr = res.ComputeAttr( + [](const paddle::drr::MatchContext &match_ctx) -> float { + auto scale = match_ctx.Attr("scale_"); + return scale; + }); + + fused_attrs.emplace("matmul_alpha", matmul_alpha_attr); + + const auto &fused_matmul = res.Op(fused_matmul_name_, fused_attrs); + + if (as_x_) { + fused_matmul({&res.Tensor("scale_in"), + &res.Tensor("other"), + &res.InputNoneTensor()}, + {&res.Tensor("Out")}); + } else { + fused_matmul({&res.Tensor("other"), + &res.Tensor("scale_in"), + &res.InputNoneTensor()}, + {&res.Tensor("Out")}); + } + } +}; + +class ScaleFusedMatmulFusePattern : public paddle::drr::DrrPatternBase { + private: + std::string matmul_name_; + std::string fused_matmul_name_; + uint32_t benefit_; + bool as_x_; // decide if the output of transpose is for input_x of matmul + + public: + ScaleFusedMatmulFusePattern(const std::string &matmul_name, + const std::string &fused_matmul_name, + uint32_t benefit, + bool as_x) + : matmul_name_(matmul_name), + fused_matmul_name_(fused_matmul_name), + benefit_(benefit), + as_x_(as_x) {} + + std::string name() const override { return "ScaleFusedMatmulFusePattern"; } + + uint32_t benefit() const override { return benefit_; } + + void operator()(paddle::drr::DrrPatternContext *ctx) const override { + paddle::drr::SourcePattern pat = ctx->SourcePattern(); + + const auto &full = pat.Op(paddle::dialect::FullOp::name(), + {{"value", pat.Attr("scale_")}}); + pat.Tensor("scale") = full(); + + const auto &scale = + pat.Op(paddle::dialect::ScaleOp::name(), + {{"bias", pat.Attr("bias")}, + {"bias_after_scale", pat.Attr("bias_after_scale")}}); + scale({&pat.Tensor("scale_in"), &pat.Tensor("scale")}, + {&pat.Tensor("scale_out")}); + + const auto &matmul = + pat.Op(matmul_name_, + {{"trans_x", pat.Attr("transpose_x")}, + {"trans_y", pat.Attr("transpose_y")}, + {"matmul_alpha", pat.Attr("matmul_alpha")}, + {"fuse_activation", pat.Attr("fuse_activation")}, + {"fuse_alpha", pat.Attr("fuse_alpha")}, + {"fuse_beta", pat.Attr("fuse_beta")}, + {"fused_output_scale", pat.Attr("fused_output_scale")}, + {"fused_reshape_x", pat.Attr("fused_reshape_x")}, + {"fused_transpose_x", pat.Attr("fused_transpose_x")}, + {"fused_reshape_y", pat.Attr("fused_reshape_y")}, + {"fused_transpose_y", pat.Attr("fused_transpose_y")}, + {"fused_reshape_out", pat.Attr("fused_reshape_out")}, + {"fused_transpose_out", pat.Attr("fused_transpose_out")}, + {"mkldnn_data_type", pat.Attr("mkldnn_data_type")}, + {"scale_x", pat.Attr("scale_x")}, + {"scale_y", pat.Attr("scale_y")}, + {"scale_in_eltwise", pat.Attr("scale_in_eltwise")}, + {"scale_out", pat.Attr("scale_out")}, + {"force_fp32_output", pat.Attr("force_fp32_output")}}); + if (as_x_) { + matmul({&pat.Tensor("scale_out"), + &pat.Tensor("other"), + &pat.Tensor("residual")}, + {&pat.Tensor("Out")}); + } else { + matmul({&pat.Tensor("other"), + &pat.Tensor("scale_out"), + &pat.Tensor("residual")}, + {&pat.Tensor("Out")}); + } + + pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { + auto matmul_alpha = match_ctx.Attr("matmul_alpha"); + auto scale = match_ctx.Attr("scale_"); + auto bias = match_ctx.Attr("bias"); + // conditions align with fluid pass + if (matmul_alpha == 0.0f) return false; + if (bias != 0.0f) return false; + if (scale <= 0.0f) return false; + return true; + }); + + paddle::drr::ResultPattern res = pat.ResultPattern(); + + std::unordered_map fused_attrs{ + {"trans_x", pat.Attr("transpose_x")}, + {"trans_y", pat.Attr("transpose_y")}, + 
{"fuse_activation", pat.Attr("fuse_activation")}, + {"fuse_alpha", pat.Attr("fuse_alpha")}, + {"fuse_beta", pat.Attr("fuse_beta")}, + {"fused_reshape_x", pat.Attr("fused_reshape_x")}, + {"fused_transpose_x", pat.Attr("fused_transpose_x")}, + {"fused_reshape_y", pat.Attr("fused_reshape_y")}, + {"fused_transpose_y", pat.Attr("fused_transpose_y")}, + {"fused_output_scale", pat.Attr("fused_output_scale")}, + {"fused_reshape_out", pat.Attr("fused_reshape_out")}, + {"fused_transpose_out", pat.Attr("fused_transpose_out")}, + {"mkldnn_data_type", pat.Attr("mkldnn_data_type")}, + {"scale_x", pat.Attr("scale_x")}, + {"scale_y", pat.Attr("scale_y")}, + {"scale_in_eltwise", pat.Attr("scale_in_eltwise")}, + {"scale_out", pat.Attr("scale_out")}, + {"force_fp32_output", pat.Attr("force_fp32_output")}}; + + const auto &matmul_alpha_attr = res.ComputeAttr( + [](const paddle::drr::MatchContext &match_ctx) -> float { + auto scale = match_ctx.Attr("scale_"); + auto matmul_alpha = match_ctx.Attr("matmul_alpha"); + return scale * matmul_alpha; + }); + + fused_attrs.emplace("matmul_alpha", matmul_alpha_attr); + + const auto &fused_matmul = res.Op(fused_matmul_name_, fused_attrs); + + if (as_x_) { + fused_matmul({&res.Tensor("scale_in"), + &res.Tensor("other"), + &res.Tensor("residual")}, + {&res.Tensor("Out")}); + } else { + fused_matmul({&res.Tensor("other"), + &res.Tensor("scale_in"), + &res.Tensor("residual")}, + {&res.Tensor("Out")}); + } + } +}; + +class ScaleMatmulFusePass : public pir::PatternRewritePass { + public: + ScaleMatmulFusePass() + : pir::PatternRewritePass("reshape_transpose_matmul_fuse_pass", 3) {} + + pir::RewritePatternSet InitializePatterns(pir::IrContext *context) override { + pir::RewritePatternSet ps(context); + std::vector bool_set = {false, true}; + int benefit_idx = 5; + for (auto as_x : bool_set) { + ps.Add(paddle::drr::Create( + context, + paddle::dialect::MatmulOp::name(), + paddle::onednn::dialect::FusedMatmulOp::name(), + benefit_idx, + as_x)); + benefit_idx--; + } + + for (auto as_x : bool_set) { + ps.Add(paddle::drr::Create( + context, + paddle::onednn::dialect::FusedMatmulOp::name(), + paddle::onednn::dialect::FusedMatmulOp::name(), + benefit_idx, + as_x)); + benefit_idx--; + } + return ps; + } +}; + +} // namespace + +namespace pir { + +std::unique_ptr CreateScaleMatmulFusePass() { + // pd_op.reshape + pd_op.transpose + pd_op.matmul -> onednn_op.fused_matmul + // pd_op.reshape + pd_op.transpose + pd_op.fused_matmul -> + // onednn_op.fused_matmul + return std::make_unique(); +} +} // namespace pir + +REGISTER_IR_PASS(scale_matmul_fuse_pass, ScaleMatmulFusePass); diff --git a/paddle/fluid/pir/transforms/onednn/scale_matmul_fuse_pass.h b/paddle/fluid/pir/transforms/onednn/scale_matmul_fuse_pass.h new file mode 100644 index 0000000000000..2ba8a3787e5dc --- /dev/null +++ b/paddle/fluid/pir/transforms/onednn/scale_matmul_fuse_pass.h @@ -0,0 +1,26 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include <memory>
+#include "paddle/pir/include/core/dll_decl.h"
+
+namespace pir {
+
+class Pass;
+
+IR_API std::unique_ptr<Pass> CreateScaleMatmulFusePass();
+
+}  // namespace pir
diff --git a/paddle/fluid/pir/transforms/onednn/squeeze_transpose_onednn_fuse_pass.cc b/paddle/fluid/pir/transforms/onednn/squeeze_transpose_onednn_fuse_pass.cc
new file mode 100644
index 0000000000000..e1f7250de2932
--- /dev/null
+++ b/paddle/fluid/pir/transforms/onednn/squeeze_transpose_onednn_fuse_pass.cc
@@ -0,0 +1,109 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/pir/transforms/onednn/squeeze_transpose_onednn_fuse_pass.h"
+
+#include "paddle/fluid/pir/dialect/operator/ir/onednn_op.h"
+#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
+#include "paddle/fluid/pir/drr/include/drr_pattern_base.h"
+
+#include "paddle/pir/include/pass/pass.h"
+#include "paddle/pir/include/pass/pass_registry.h"
+
+namespace {
+
+class SqueezeTransposePattern : public paddle::drr::DrrPatternBase {
+ public:
+  SqueezeTransposePattern() {}
+
+  std::string name() const override { return "SqueezeTransposePattern"; }
+
+  uint32_t benefit() const override { return 2; }
+
+  void operator()(paddle::drr::DrrPatternContext *ctx) const override {
+    paddle::drr::SourcePattern pat = ctx->SourcePattern();
+
+    const auto &squeeze = pat.Op(paddle::dialect::SqueezeOp::name());
+    const auto &full_1 = pat.Op(paddle::dialect::FullIntArrayOp::name(),
+                                {{"value", pat.Attr("full_1_value")}});
+
+    squeeze({&pat.Tensor("x"), &full_1()},
+            {&pat.Tensor("squeeze_out"), &pat.Tensor("xshape")});
+
+    const auto &transpose = pat.Op(paddle::dialect::TransposeOp::name(),
+                                   {{"perm", pat.Attr("perm")}});
+
+    transpose({&pat.Tensor("squeeze_out")}, {&pat.Tensor("transpose_op_out")});
+
+    pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) {
+      auto axis = match_ctx.Attr<std::vector<int64_t>>("full_1_value");
+      auto perm = match_ctx.Attr<std::vector<int>>("perm");
+      if (perm.size() <= 0) return false;
+      if (axis.size() <= 0) return false;
+      return true;
+    });
+
+    paddle::drr::ResultPattern res = pat.ResultPattern();
+
+    const auto &fused_reshape_attr = res.ComputeAttr(
+        [](const paddle::drr::MatchContext &match_ctx) -> std::vector<int> {
+          std::vector<int> int_array_value;
+          auto shape = match_ctx.Attr<std::vector<int64_t>>("full_1_value");
+          for (auto i : shape) {
+            int_array_value.emplace_back(static_cast<int>(i));
+          }
+          return int_array_value;
+        });
+
+    const auto &fused_transpose =
+        res.Op(paddle::onednn::dialect::FusedTransposeOp::name(),
+               {{
+                   {"axis", pat.Attr("perm")},
+                   {"fused_squeeze2_axes", fused_reshape_attr},
+                   {"fused_unsqueeze2_axes", res.VectorInt32Attr({})},
+                   {"fused_reshape2_shape", res.VectorInt32Attr({})},
+                   {"scale", res.Float32Attr(1.0f)},
+                   {"shift", res.Float32Attr(0.0f)},
+                   {"output_data_type", res.StrAttr("fp32")},
+                   {"data_format", res.StrAttr("AnyLayout")},
+                   {"mkldnn_data_type", res.StrAttr("float32")},
+               }});
+    fused_transpose({&res.Tensor("x")}, {&res.Tensor("transpose_op_out")});
+  }
+};
+
+class SqueezeTransposePass : public pir::PatternRewritePass {
+ public:
+  SqueezeTransposePass()
+      : pir::PatternRewritePass("squeeze_transpose_onednn_fuse_pass", 3) {}
+
+  pir::RewritePatternSet InitializePatterns(pir::IrContext *context) override {
+    pir::RewritePatternSet ps(context);
+    ps.Add(paddle::drr::Create<SqueezeTransposePattern>(context));
+    return ps;
+  }
+};
+
+}  // namespace
+
+namespace pir {
+
+std::unique_ptr<Pass> CreateSqueezeTransposeOneDNNPass() {
+  // pd_op.squeeze + transpose2 -> onednn_op.fused_transpose
+  return std::make_unique<SqueezeTransposePass>();
+}
+
+}  // namespace pir
+
+REGISTER_IR_PASS(squeeze_transpose_onednn_fuse_pass, SqueezeTransposePass);
diff --git a/paddle/fluid/pir/transforms/onednn/squeeze_transpose_onednn_fuse_pass.h b/paddle/fluid/pir/transforms/onednn/squeeze_transpose_onednn_fuse_pass.h
new file mode 100644
index 0000000000000..fce3e0f6e8a80
--- /dev/null
+++ b/paddle/fluid/pir/transforms/onednn/squeeze_transpose_onednn_fuse_pass.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include "paddle/pir/include/core/dll_decl.h"
+
+namespace pir {
+
+class Pass;
+
+IR_API std::unique_ptr<Pass> CreateSqueezeTransposeOneDNNPass();
+
+}  // namespace pir
diff --git a/paddle/fluid/pir/transforms/passes.h b/paddle/fluid/pir/transforms/passes.h
index 27a5c741e157d..bc15794c45ec6 100644
--- a/paddle/fluid/pir/transforms/passes.h
+++ b/paddle/fluid/pir/transforms/passes.h
@@ -38,17 +38,27 @@ USE_PIR_PASS(conv2d_add_act_fuse_pass);
 USE_PIR_PASS(embedding_eltwise_layernorm_fuse_pass);
 USE_PIR_PASS(add_norm_fuse_pass);
 USE_PIR_PASS(fused_dot_product_attention_pass);
+USE_PIR_PASS(fused_flash_attn_pass);
+USE_PIR_PASS(remove_redundant_transpose_pass);
 #ifdef PADDLE_WITH_DNNL
+USE_PIR_PASS(depthwise_conv_onednn_pass);
+USE_PIR_PASS(squeeze_transpose_onednn_fuse_pass);
 USE_PIR_PASS(batch_norm_act_fuse_pass);
 USE_PIR_PASS(conv2d_bias_fuse_pass);
 USE_PIR_PASS(conv2d_transpose_bias_fuse_pass);
 USE_PIR_PASS(conv3d_bias_fuse_pass);
+USE_PIR_PASS(scale_matmul_fuse_pass);
+USE_PIR_PASS(reshape_transpose_matmul_fuse_pass);
+USE_PIR_PASS(matmul_transpose_reshape_fuse_pass);
 USE_PIR_PASS(matmul_elementwise_add_fuse_pass);
 USE_PIR_PASS(matmul_activation_fuse_pass);
-USE_PIR_PASS(conv_elementwise_add_mkldnn_fuse_pass);
+USE_PIR_PASS(conv_elementwise_add_onednn_fuse_pass);
+USE_PIR_PASS(conv_activation_onednn_fuse_pass);
+USE_PIR_PASS(conv_concat_activation_onednn_fuse_pass);
 #endif
 #ifdef PADDLE_WITH_XPU
 USE_PIR_PASS(add_layernorm_xpu_fuse_pass);
+USE_PIR_PASS(group_norm_silu_xpu_fuse_pass);
 #endif
diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc
index 182aa009a020c..43a3e2237036b 100644
--- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc
+++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc
@@ -754,7 +754,7 @@ static phi::Backend GetKernelBackendByYaml(
   auto& backend_info = op_info_parser->OpRuntimeInfo().kernel_key_backend;
   phi::Backend kernel_backend = phi::Backend::UNDEFINED;
 
-  for (auto slot_name : backend_info) {
+  for (const auto& slot_name : backend_info) {
     auto& input_map = op_info_parser->InputName2Id();
 
     if (input_map.count(slot_name)) {
diff --git a/paddle/fluid/pir/transforms/shape_optimization_pass.cc b/paddle/fluid/pir/transforms/shape_optimization_pass.cc
index d5ced352047da..932d03d7a42ff 100644
--- a/paddle/fluid/pir/transforms/shape_optimization_pass.cc
+++ b/paddle/fluid/pir/transforms/shape_optimization_pass.cc
@@ -270,6 +270,24 @@ class ShapeOptimizationPass : public pir::Pass {
 
 }  // namespace
 
+static inline bool IsStaticShape(const Value& value) {
+  const auto& value_type = value.type();
+  if (!value || !value_type ||
+      !value_type.isa<pir::DenseTensorType>()) {
+    return false;
+  }
+  return !::common::contain_unknown_dim(
+      value_type.dyn_cast<pir::DenseTensorType>().dims());
+}
+
+symbol::ShapeOrDataDimExprs CreateShapeOrDataByDDim(const pir::DDim& dims) {
+  std::vector<symbol::DimExpr> dim_exprs;
+  for (int i = 0; i < dims.size(); ++i) {
+    dim_exprs.emplace_back(symbol::DimExpr{dims.at(i)});
+  }
+  return symbol::TensorShapeOrDataDimExprs{dim_exprs};
+}
+
 void InferSymExprForBlock(const Block& block,
                           ShapeConstraintIRAnalysis* shape_analysis) {
   for (auto& op : block) {
@@ -290,8 +308,33 @@ void InferSymExprForBlock(const Block& block,
             &op, shape_analysis->GetShapeOrDataForValue(op.result(0)));
       }
     } else {
-      PADDLE_THROW(phi::errors::Unimplemented(
-          op.name() + " DOES NOT have InferSymbolicShapeInterface!"));
+      const bool all_outs_static_dims = [&] {
+        bool all_static_dims = true;
+        for (uint32_t i = 0; i < op.num_results(); ++i) {
+          if (IsStaticShape(op.result(i))) {
+            continue;
+          } else {
+            all_static_dims = false;
+            break;
+          }
+        }
+        return all_static_dims;
+      }();
+
+      if (all_outs_static_dims) {
+        for (uint32_t i = 0; i < op.num_results(); ++i) {
+          shape_analysis->SetShapeOrDataForValue(
+              op.result(i),
+              CreateShapeOrDataByDDim(
+                  op.result(i)
+                      .type()
+                      .dyn_cast<pir::DenseTensorType>()
+                      .dims()));
+        }
+      } else {
+        PADDLE_THROW(phi::errors::Unimplemented(
+            op.name() + " DOES NOT have InferSymbolicShapeInterface!"));
+      }
     }
     DebugPrintOpInfo(&op, shape_analysis);
     CheckInferSymWithInferMeta(&op, shape_analysis);
diff --git a/paddle/fluid/pir/transforms/xpu/group_norm_silu_fuse_pass.cc b/paddle/fluid/pir/transforms/xpu/group_norm_silu_fuse_pass.cc
new file mode 100644
index 0000000000000..0b93a496d4dde
--- /dev/null
+++ b/paddle/fluid/pir/transforms/xpu/group_norm_silu_fuse_pass.cc
@@ -0,0 +1,99 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/pir/transforms/xpu/group_norm_silu_fuse_pass.h"
+
+#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
+#include "paddle/fluid/pir/drr/include/drr_pattern_base.h"
+#include "paddle/fluid/pir/utils/general_functions.h"
+
+#include "paddle/pir/include/pass/pass.h"
+#include "paddle/pir/include/pass/pass_registry.h"
+
+/*
+fuse the gn + activation block into a group_norm_silu op
+For example:
+graph:
+                      X
+            Scale     |     Bias
+                 \    |    /
+                 group norm
+                 /    |    \
+                /     |     \
+        variance      |      mean
+                      |
+                    silu
+                      |
+                    output
+------------------------------------------------------
+After the pass is applied:
+                      X
+            Scale     |     Bias
+                 \    |    /
+             group_norm_silu
+                      |
+                     Out
+*/
+
+namespace {
+
+class GroupNormSiluPattern : public paddle::drr::DrrPatternBase {
+ public:
+  std::string name() const override { return "GroupNormSiluPattern"; }
+
+  void operator()(paddle::drr::DrrPatternContext *ctx) const override {
+    paddle::drr::SourcePattern pat = ctx->SourcePattern();
+    const auto &groupnorm = pat.Op(
+        paddle::dialect::GroupNormOp::name(),
+        {{"epsilon", pat.Attr("epsilon")}, {"groups", pat.Attr("groups")}});
+
+    const auto &silu = pat.Op(paddle::dialect::SiluOp::name());
+
+    groupnorm({&pat.Tensor("X"), &pat.Tensor("Scale"), &pat.Tensor("Bias")},
+              {&pat.Tensor("Y"), &pat.Tensor("Mean"), &pat.Tensor("Variance")});
+    silu({&pat.Tensor("Y")}, {&pat.Tensor("Out")});
+
+    paddle::drr::ResultPattern res = pat.ResultPattern();
+
+    const auto &group_norm_silu_xpu = res.Op(
+        paddle::dialect::GroupNormSiluXpuOp::name(),
+        {{{"epsilon", pat.Attr("epsilon")}, {"groups", pat.Attr("groups")}}});
+    group_norm_silu_xpu(
+        {&res.Tensor("X"), &res.Tensor("Scale"), &res.Tensor("Bias")},
+        {&res.Tensor("Out")});
+  }
+};
+
+class GroupNormSiluXpuFusePass : public pir::PatternRewritePass {
+ public:
+  GroupNormSiluXpuFusePass()
+      : pir::PatternRewritePass("group_norm_silu_xpu_fuse_pass", 2) {}
+
+  pir::RewritePatternSet InitializePatterns(pir::IrContext *context) override {
+    pir::RewritePatternSet ps(context);
+    ps.Add(paddle::drr::Create<GroupNormSiluPattern>(context));
+    return ps;
+  }
+};
+
+}  // namespace
+
+namespace pir {
+std::unique_ptr<Pass> CreateGroupNormSiluXpuFusePass() {
+  return std::make_unique<GroupNormSiluXpuFusePass>();
+}
+
+}  // namespace pir
+
+REGISTER_IR_PASS(group_norm_silu_xpu_fuse_pass, GroupNormSiluXpuFusePass);
diff --git a/paddle/fluid/pir/transforms/xpu/group_norm_silu_fuse_pass.h b/paddle/fluid/pir/transforms/xpu/group_norm_silu_fuse_pass.h
new file mode 100644
index 0000000000000..665c7dcb03f16
--- /dev/null
+++ b/paddle/fluid/pir/transforms/xpu/group_norm_silu_fuse_pass.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include "paddle/pir/include/core/dll_decl.h"
+
+namespace pir {
+
+class Pass;
+
+IR_API std::unique_ptr<Pass> CreateGroupNormSiluXpuFusePass();
+
+}  // namespace pir
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 2cabc79bb3844..99a6606d1183a 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -41,10 +41,10 @@ cc_test(
   SRCS place_test.cc
   DEPS place glog phi common)
 
-if(WITH_MKLDNN)
-  set(MKLDNN_CTX_DEPS mkldnn)
+if(WITH_ONEDNN)
+  set(ONEDNN_CTX_DEPS onednn)
 else()
-  set(MKLDNN_CTX_DEPS)
+  set(ONEDNN_CTX_DEPS)
 endif()
 
 add_subdirectory(device)
@@ -126,7 +126,7 @@ cc_library(
     framework_proto
     ${IPU_CTX_DEPS}
     ${GPU_CTX_DEPS}
-    ${MKLDNN_CTX_DEPS}
+    ${ONEDNN_CTX_DEPS}
    ${dgc_deps}
    dlpack
    phi
diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc
index 36189cc7e4c90..73704b04cf90b 100644
--- a/paddle/fluid/platform/device/gpu/gpu_info.cc
+++ b/paddle/fluid/platform/device/gpu/gpu_info.cc
@@ -217,6 +217,7 @@ class RecordedGpuMallocHelper {
     CUDADeviceGuard guard(dev_id_);
     gpuError_t result;
 #ifdef PADDLE_WITH_HIP
+    phi::backends::gpu::CUDAGraphCaptureModeGuard capture_mode_guard;
     if (UNLIKELY(malloc_managed_memory)) {
       result = hipMallocManaged(ptr, size);
     } else {
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/onednn_helper.h
similarity index 99%
rename from paddle/fluid/platform/mkldnn_helper.h
rename to paddle/fluid/platform/onednn_helper.h
index 6132aa9292e56..145f42f669d9d 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/onednn_helper.h
@@ -52,7 +52,7 @@ inline void DontClearMKLDNNCache(const platform::Place& place) {
   }
 }
 
-// If MKLDNN build and CPU place then register suffix in DeviceContext
+// If OneDNN build and CPU place then register suffix in DeviceContext
 inline void AttachPointerHashToMKLDNNKey(void* ptr,
                                          const platform::Place& place) {
   if (platform::is_cpu_place(place)) {
diff --git a/paddle/fluid/platform/mkldnn_op_list.h b/paddle/fluid/platform/onednn_op_list.h
similarity index 100%
rename from paddle/fluid/platform/mkldnn_op_list.h
rename to paddle/fluid/platform/onednn_op_list.h
diff --git a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h
index 169d41d9763e5..4ef0cfee6e283 100644
--- a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h
+++ b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h
@@ -459,6 +459,15 @@ void sqrt_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) {
   }
 }
 
+template <typename T>
+void rsqrt_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) {
+  if (x_grad) {
+    // This calculation is important for resnet.
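+    // With out = rsqrt(x) = x^(-1/2), d(out)/dx = -0.5 * x^(-3/2) = -0.5 * out^3,
+    // so the VJP below is x_grad = -0.5 * out^3 * out_grad.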
+    auto x_grad_tmp = -0.5 * out * out * out * out_grad;
+    set_output<T>(x_grad_tmp, x_grad);
+  }
+}
+
 template <typename T>
 void floor_grad(const Tensor& out_grad, Tensor* x_grad) {
   if (x_grad) {
@@ -1123,6 +1132,58 @@ void max_grad(const Tensor& x,
   set_output<T>(x_grad_tmp, x_grad);
 }
 
+template <typename T>
+void min_grad(const Tensor& x,
+              const Tensor& out,
+              const Tensor& out_grad,
+              const IntArray& axis,
+              bool keepdim,
+              bool reduce_all,
+              Tensor* x_grad) {
+  if (!x_grad) {
+    return;
+  }
+  auto zero_tensor = full<T>(common::vectorize(x.dims()), 0.0, x.dtype());
+  std::vector<int64_t> x_dim = common::vectorize(x.dims());
+  int64_t axis_size = axis.size();
+  int64_t x_dim_size = x_dim.size();
+  reduce_all = false;
+  if (reduce_all || axis_size == 0 || axis_size == x_dim_size) {
+    reduce_all = true;
+  } else {
+    reduce_all = false;
+  }
+  auto x_grad_tmp = Tensor();
+  if (x_dim_size == 0 || x_dim_size == 1 || keepdim) {
+    auto out_grad_tmp = out_grad.expand(IntArray(x_dim));
+    auto out_tmp = out.expand(IntArray(x_dim));
+    auto mask = equal<T>(x, out_tmp);
+    x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
+  } else {
+    auto axis_ = std::vector<int64_t>();
+    if (reduce_all) {
+      for (int64_t i = 0; i < x_dim_size; i++) {
+        axis_.push_back(i);
+      }
+    } else {
+      axis_ = axis.GetData();
+      for (int64_t i = 0; i < axis_size; i++) {
+        if (axis[i] < 0) {
+          axis_[i] = axis[i] + x_dim_size;
+        }
+      }
+    }
+    auto out_grad_shape = get_unsqueeze_dims(out_grad, axis_);
+    auto out_grad_ = reshape<T>(out_grad, out_grad_shape);
+    auto out_ = reshape<T>(out, out_grad_shape);
+    auto out_grad_tmp = out_grad_.expand(IntArray(x_dim));
+    auto out_tmp = out_.expand(IntArray(x_dim));
+    auto mask = equal<T>(x, out_tmp);
+    x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
+  }
+  set_output<T>(x_grad_tmp, x_grad);
+}
+
 template <typename T>
 void assign_grad(const Tensor& out_grad, Tensor* x_grad) {
   if (x_grad) {
diff --git a/paddle/fluid/prim/api/composite_backward/composite_double_backward_api.h b/paddle/fluid/prim/api/composite_backward/composite_double_backward_api.h
index 2c5c4fcea8b41..67feb640c9f7a 100644
--- a/paddle/fluid/prim/api/composite_backward/composite_double_backward_api.h
+++ b/paddle/fluid/prim/api/composite_backward/composite_double_backward_api.h
@@ -702,9 +702,9 @@ void add_double_grad(const Tensor& y,
   if (grad_x_grad && grad_y_grad) {
     set_output<T>(grad_x_grad.get() + grad_y_grad.get(), grad_out_grad);
   } else if (grad_x_grad) {
-    set_output<T>(grad_x_grad.get(), grad_out_grad);
+    by_pass<T>(grad_x_grad.get(), grad_out_grad);
   } else if (grad_y_grad) {
-    set_output<T>(grad_y_grad.get(), grad_out_grad);
+    by_pass<T>(grad_y_grad.get(), grad_out_grad);
   } else {
     set_output<T>(full<T>(common::vectorize(grad_out.dims()), 0.0, y.dtype()),
                   grad_out_grad);
@@ -773,9 +773,9 @@ void subtract_double_grad(const Tensor& y,
   if (grad_x_grad && grad_y_grad) {
     set_output<T>(grad_x_grad.get() - grad_y_grad.get(), grad_out_grad);
   } else if (grad_x_grad) {
-    set_output<T>(grad_x_grad.get(), grad_out_grad);
+    by_pass<T>(grad_x_grad.get(), grad_out_grad);
   } else if (grad_y_grad) {
-    set_output<T>(-grad_y_grad.get(), grad_out_grad);
+    by_pass<T>(-grad_y_grad.get(), grad_out_grad);
   } else {
     set_output<T>(
         full<T>(common::vectorize(grad_out.dims()), 0, grad_out.dtype()),
@@ -784,5 +784,54 @@ void subtract_double_grad(const Tensor& y,
   }
 }
 
+template <typename T>
+void exp_double_grad(const Tensor& out,
+                     const Tensor& grad_out,
+                     const Tensor& grad_x_grad,
+                     Tensor* out_grad,
+                     Tensor* grad_out_grad) {
+  // dout = dout_old * ddx
+  if (out_grad) {
+    auto out_grad_tmp = grad_out * grad_x_grad;
+    set_output<T>(out_grad_tmp, out_grad);
+  }
+
+  // ddout = out * ddx
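+  // (For exp, the first-order rule is dx = out * dout; differentiating that
+  // product w.r.t. out gives the dout term above, and w.r.t. dout gives
+  // ddout = out * ddx below.)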
+  if (grad_out_grad) {
+    auto grad_out_grad_tmp = out * grad_x_grad;
+    set_output<T>(grad_out_grad_tmp, grad_out_grad);
+  }
+}
+
+template <typename T>
+void log_double_grad(const Tensor& x,
+                     const Tensor& grad_out,
+                     const Tensor& grad_x_grad,
+                     Tensor* x_grad,
+                     Tensor* grad_out_grad) {
+  // dx = -dout/x^2 * ddx
+  if (x_grad) {
+    auto x_grad_tmp = -grad_out / (x * x) * grad_x_grad;
+    set_output<T>(x_grad_tmp, x_grad);
+  }
+
+  // ddout = ddx / x
+  if (grad_out_grad) {
+    auto grad_out_grad_tmp = grad_x_grad / x;
+    set_output<T>(grad_out_grad_tmp, grad_out_grad);
+  }
+}
+
+template <typename T>
+void abs_triple_grad(const Tensor& x,
+                     const Tensor& grad_out_grad_grad,
+                     Tensor* grad_grad_x_grad) {
+  // dddx = sign(x) * dddout
+  if (grad_grad_x_grad) {
+    auto grad_grad_x_grad_tmp = sign<T>(x) * grad_out_grad_grad;
+    set_output<T>(grad_grad_x_grad_tmp, grad_grad_x_grad);
+  }
+}
+
 }  // namespace prim
 }  // namespace paddle
diff --git a/paddle/fluid/prim/api/manual_prim/utils/utils.h b/paddle/fluid/prim/api/manual_prim/utils/utils.h
index 9062d979b40db..cbbe846671114 100644
--- a/paddle/fluid/prim/api/manual_prim/utils/utils.h
+++ b/paddle/fluid/prim/api/manual_prim/utils/utils.h
@@ -88,7 +88,7 @@ static phi::DDim get_reduce_dims(const phi::DDim& x_dims,
  * y_dims = [2, 1, 6, 1]    <-- shapes are right-aligned for comparison
  *          <-- broadcast -->
  * z_dims = [10, 2, 4, 6, 5]
- * ==> reduce_dims_from_z_to_x = [0, 1, 3]
+ * ==> reduce_dims_from_z_to_x = [1, 3]
  * ==> reduce_dims_from_z_to_y = [0, 2, 4]
  */
  auto out_dims = paddle::operators::details::BroadcastTwoDims(x_dims, y_dims);
diff --git a/paddle/fluid/primitive/backend/manual/manual_static_prim_backend.cc b/paddle/fluid/primitive/backend/manual/manual_static_prim_backend.cc
index a79e929a6e5cc..a479379cc6ab4 100644
--- a/paddle/fluid/primitive/backend/manual/manual_static_prim_backend.cc
+++ b/paddle/fluid/primitive/backend/manual/manual_static_prim_backend.cc
@@ -43,7 +43,7 @@ Tensor full_with_tensor(const Tensor& shape,
       std::static_pointer_cast<LazyTensor>(shape.impl())->value();
   pir::Value value_res = paddle::dialect::full(
       std::vector<int64_t>{}, value.to<float>(), dtype, place);
-  auto op_res = paddle::dialect::full_with_tensor(shape_res, value_res, dtype);
+  auto op_res = paddle::dialect::full_with_tensor(value_res, shape_res, dtype);
   Tensor out(std::make_shared<LazyTensor>(op_res));
   return out;
 }
diff --git a/paddle/fluid/primitive/base/decomp_trans.cc b/paddle/fluid/primitive/base/decomp_trans.cc
index c71da029b4e37..7d8a0b723eceb 100644
--- a/paddle/fluid/primitive/base/decomp_trans.cc
+++ b/paddle/fluid/primitive/base/decomp_trans.cc
@@ -298,12 +298,12 @@ std::vector<pir::Value> DecompProgram::format_decomp_res(
   return new_decomp_outs;
 }
 
-std::vector<pir::Value> DecompProgram::construct_dst_vars(
+void DecompProgram::construct_dst_vars(
     const std::string& op_name,
     const std::vector<pir::Value>& orig_outs,
     const std::vector<pir::Value>& decomp_outs,
-    std::unordered_map<pir::Value, int> orig_vars_dict) {
-  std::vector<pir::Value> tar_vars(src_vars_.size());
+    std::unordered_map<pir::Value, int> orig_vars_dict,
+    std::vector<pir::Value>* tar_vars) {
   PADDLE_ENFORCE_EQ(
       orig_outs.size(),
       decomp_outs.size(),
@@ -315,10 +315,9 @@ std::vector<pir::Value> DecompProgram::construct_dst_vars(
           decomp_outs.size()));
   for (size_t i = 0; i < orig_outs.size(); i++) {
     if (orig_vars_dict.find(orig_outs[i]) != orig_vars_dict.end()) {
-      tar_vars[orig_vars_dict[orig_outs[i]]] = decomp_outs[i];
+      (*tar_vars)[orig_vars_dict[orig_outs[i]]] = decomp_outs[i];
     }
   }
-  return tar_vars;
 }
 
 std::vector<pir::Value> DecompProgram::get_dst_vars() {
@@ -427,8 +426,11 @@ void DecompProgram::decomp_block(
       std::vector<pir::Value> standard_decomp_res =
           format_decomp_res(op->name(), orig_outs, decomp_res);
       check_decomp_outputs(op->name(), orig_outs, standard_decomp_res);
-      tar_vars = construct_dst_vars(
-          op->name(), orig_outs, standard_decomp_res, orig_vars_dict);
+      construct_dst_vars(op->name(),
+                         orig_outs,
+                         standard_decomp_res,
+                         orig_vars_dict,
+                         &tar_vars);
       op->ReplaceAllUsesWith(standard_decomp_res);
 
       bool remove_op = true;
diff --git a/paddle/fluid/primitive/base/decomp_trans.h b/paddle/fluid/primitive/base/decomp_trans.h
index 21e48d94f97a7..00f538cb0dafc 100644
--- a/paddle/fluid/primitive/base/decomp_trans.h
+++ b/paddle/fluid/primitive/base/decomp_trans.h
@@ -50,11 +50,11 @@ class DecompProgram {
       const std::string& op_name,
       const std::vector<pir::Value>& orig_outs,
       const std::vector<std::vector<pir::Value>>& decomp_outs);
-  std::vector<pir::Value> construct_dst_vars(
-      const std::string& op_name,
-      const std::vector<pir::Value>& orig_outs,
-      const std::vector<pir::Value>& decomp_outs,
-      std::unordered_map<pir::Value, int> orig_vars_dict);
+  void construct_dst_vars(const std::string& op_name,
+                          const std::vector<pir::Value>& orig_outs,
+                          const std::vector<pir::Value>& decomp_outs,
+                          std::unordered_map<pir::Value, int> orig_vars_dict,
+                          std::vector<pir::Value>* tar_vars);
   bool enable_decomp_by_filter(const std::string& op_name);
   void set_src_vars(const std::vector<pir::Value>& src_vars) {
     src_vars_ = src_vars;
diff --git a/paddle/fluid/primitive/base/primitive_ops.h b/paddle/fluid/primitive/base/primitive_ops.h
index aa52907f8f7fe..4aafd7693ae75 100644
--- a/paddle/fluid/primitive/base/primitive_ops.h
+++ b/paddle/fluid/primitive/base/primitive_ops.h
@@ -46,7 +46,6 @@ const std::set<std::string>& GetPrimitiveOpNames() {
     "pd_op.assign_value",
     "pd_op.concat",
     "pd_op.elementwise_pow",
-    "pd_op.rsqrt",
     "pd_op.floor",
     "pd_op.gather",
     "pd_op.gather_nd",
@@ -91,6 +90,10 @@ const std::set<std::string>& GetPrimitiveOpNames() {
     "pd_op.full_with_tensor",
     "pd_op.if",
     "pd_op.while",
+    /* Considering better performance, such ops are set as primitive ops
+       temporarily*/
+    "pd_op.rsqrt",
+    "pd_op.sqrt",
     /* basic ops by PIR*/
     "builtin.combine",
     "builtin.slice",
diff --git a/paddle/fluid/primitive/codegen/decomp_gen.py b/paddle/fluid/primitive/codegen/decomp_gen.py
index 95b40f9f87506..bfc157d24c3a6 100644
--- a/paddle/fluid/primitive/codegen/decomp_gen.py
+++ b/paddle/fluid/primitive/codegen/decomp_gen.py
@@ -156,7 +156,7 @@ def gen(
 
     Args:
         prim_path (pathlib.Path): The YAML file path of the primitive API.
-        fwd_path (pathlib.Path): The YAML file path of the forwad API.
+        fwd_path (pathlib.Path): The YAML file path of the forward API.
         rev_path (pathlib.Path): The YAML file path of the backward API.
         compat_path: (pathlib.Path): The YAML file path of the ops compat.
         fwd_pd_op_path (pathlib.Path): The YAML file path of the ir forward API.
diff --git a/paddle/fluid/primitive/codegen/gen.py b/paddle/fluid/primitive/codegen/gen.py
index e4d0e50e60877..dd75859e16b74 100644
--- a/paddle/fluid/primitive/codegen/gen.py
+++ b/paddle/fluid/primitive/codegen/gen.py
@@ -63,6 +63,7 @@
     'exp_grad',
     'floor_grad',
     'log_grad',
+    'rsqrt_grad',
     'sin_grad',
     'cos_grad',
     'tanh_grad',
@@ -117,6 +118,9 @@
     'relu_grad',
     'sigmoid_grad',
     'silu_grad',
+    'exp_grad',
+    'log_grad',
+    'abs_double_grad',
     'softmax_grad',
     'sqrt_grad',
 ]  # custom vjp list of composite op
diff --git a/paddle/fluid/primitive/composite/composite.h b/paddle/fluid/primitive/composite/composite.h
index 539d161243698..67cc7d6388460 100644
--- a/paddle/fluid/primitive/composite/composite.h
+++ b/paddle/fluid/primitive/composite/composite.h
@@ -127,6 +127,59 @@ static bool valid_type(const DataType& dtype) {
   }
 }
 
+template <typename T>
+Tensor p_norm_decomp(const Tensor& x,
+                     const float& porder = 2.0,
+                     const int& axis = -1,
+                     const float epsilon = 1.0e-12f,
+                     const bool& keepdim = false,
+                     const bool& asvector = false) {
+  auto org_dtype = x.dtype();
+  auto x_tmp = x;
+
+  bool need_cast = is_half_dtype(org_dtype);
+  if (need_cast) {
+    x_tmp = cast<T>(x, DataType::FLOAT32);
+  }
+
+  Tensor res;
+  if (porder == 0.0) {
+    // 0-norm
+    auto zero = full<T>(empty_shape, 0, x_tmp.dtype());
+    auto none_zero = not_equal<T>(x_tmp, zero);
+    res = cast<T>(none_zero, x_tmp.dtype());
+    res = sum<T>(res, {axis}, x_tmp.dtype(), keepdim);
+  } else if (porder == 1.0) {
+    // 1-norm
+    res = abs<T>(x_tmp);
+    res = sum<T>(res, {axis}, x_tmp.dtype(), keepdim);
+  } else if (porder == 2.0) {
+    // 2-norm
+    res = sqrt<T>(sum<T>(x_tmp * x_tmp, {axis}, x_tmp.dtype(), keepdim));
+  } else if (porder == INFINITY) {
+    // +INF-norm: max of |x|
+    res = abs<T>(x_tmp);
+    res = max<T>(res, {axis}, keepdim);
+  } else if (porder == -INFINITY) {
+    // -INF-norm: min of |x|
+    res = abs<T>(x_tmp);
+    res = min<T>(res, {axis}, keepdim);
+  } else {
+    // vanilla p-norm
+    auto porder_tensor = full<T>(empty_shape, porder, x_tmp.dtype());
+    auto inv_porder_tensor = full<T>(empty_shape, 1 / porder, x_tmp.dtype());
+    res = elementwise_pow<T>(x_tmp, porder_tensor);
+    res = sum<T>(res, {axis}, x_tmp.dtype(), keepdim);
+    res = elementwise_pow<T>(res, inv_porder_tensor);
+  }
+
+  if (need_cast) {
+    return cast<T>(res, org_dtype);
+  } else {
+    return res;
+  }
+}
+
 template <typename T>
 Tensor pow_decomp(const Tensor& x, const paddle::Scalar& y) {
   auto org_dtype = x.dtype();
@@ -153,6 +206,11 @@ Tensor pow_decomp(const Tensor& x, const paddle::Scalar& y) {
   }
 }
 
+template <typename T>
+Tensor reciprocal_decomp(const Tensor& x) {
+  return full<T>(empty_shape, 1.0, x.dtype()) / x;
+}
+
 template <typename T>
 std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> batch_norm_decomp(
     const Tensor& x,
@@ -592,24 +650,6 @@ std::tuple<Tensor, Tensor> dropout_decomp(
   }
 }
 
-template <typename T>
-Tensor sqrt_decomp(const Tensor& x) {
-  auto org_dtype = x.dtype();
-  Tensor x_cast = x;
-
-  bool need_cast = is_half_dtype(org_dtype);
-  if (need_cast) {
-    x_cast = cast<T>(x, DataType::FLOAT32);
-  }
-
-  auto ans = 1.0 / rsqrt<T>(x_cast);
-  if (need_cast) {
-    return cast<T>(ans, org_dtype);
-  } else {
-    return ans;
-  }
-}
-
 template <typename T>
 Tensor gelu_decomp(const Tensor& x, bool approximate) {
   const double PM_2_SQRTPI = 1.12837916709551257390; /* 2/sqrt(pi) */
@@ -778,10 +818,12 @@ std::tuple<Tensor, Tensor> flatten_decomp(const Tensor& x,
 
   for (size_t i = 0; i < x_dim.size();) {
     if (i == static_cast<size_t>(start_axis)) {
-      Tensor flat =
-          slice<T>(x_shape, {0}, {start_axis}, {end_axis + 1}, {1}, {});
-      flat = prod<T>(flat, {0}, false, false);
-      out_shape.push_back(reshape<T>(flat, {1}));
+      Tensor flat = get_slice<T>(x_shape, i);
+
+      for (auto t = start_axis + 1; t <= end_axis; ++t) {
+        flat = flat * get_slice<T>(x_shape, t);
+      }
+      out_shape.push_back(flat);
       i = end_axis + 1;
     } else {
       out_shape.push_back(get_slice<T>(x_shape, i));
@@ -839,14 +881,19 @@ std::tuple<Tensor, Tensor, Tensor> group_norm_decomp(
     const float epsilon,
     const int groups,
    const std::string& data_format) {
-  if (data_format != "NCHW") {
-    // TODO(chengyanfu): support NHWC data format
-    PADDLE_THROW(phi::errors::Unimplemented("Only support NCHW format."));
+  std::vector<int64_t> c_axis;
+  if (data_format == "NCHW") {
+    c_axis = {1};
+  } else if (data_format == "NHWC") {
+    c_axis = {1, 3};
+  } else {
+    PADDLE_THROW(
+        phi::errors::Unimplemented("Only support NCHW and NHWC format."));
   }
   size_t rank = x.shape().size();
-  if (rank != 3 && rank != 4) {
-    PADDLE_THROW(
-        phi::errors::Unimplemented("Only support NCHW format in rank 3 or 4."));
+  if (rank < 3 || rank > 5) {
+    PADDLE_THROW(phi::errors::Unimplemented(
+        "Only support NCHW and NHWC format in rank {3, 4, 5}."));
   }
 
   auto org_dtype = x.dtype();
@@ -856,22 +903,28 @@ std::tuple<Tensor, Tensor, Tensor> group_norm_decomp(
   if (need_cast) {
     x_cast = cast<T>(x, DataType::FLOAT32);
   }
-  if (rank == 3) {
-    x_cast = unsqueeze<T>(x_cast, {-1});
-  }
 
+  Tensor x_dim_t;
   Tensor out, mean_, var_;
   if (has_dynamic_shape(x_cast.shape())) {
-    Tensor x_dim_t = shape<T>(x_cast);
-    std::vector<int64_t> one_axis(1, 1);
-    Tensor x_shape = get_slice<T>(x_dim_t, 0) * groups;
-    Tensor dim_1 = full<T>({1}, -1, x_dim_t.type());
-    x_shape = concat<T>({x_shape, dim_1});
-    x_cast = backend::reshape<T>(x_cast, x_shape);
-    mean_ = mean_decomp<T>(x_cast, IntArray(one_axis), true);
+    x_dim_t = shape<T>(x_cast);
+    Tensor tar_shape;
+    if (data_format == "NCHW") {
+      tar_shape = get_slice<T>(x_dim_t, 0) * groups;
+      Tensor dim_1 = full<T>({1}, -1, x_dim_t.type());
+      tar_shape = concat<T>({tar_shape, dim_1});
+    } else {
+      Tensor N_shape = get_slice<T>(x_dim_t, 0);
+      Tensor dim_1 = full<T>({1}, -1, x_dim_t.type());
+      Tensor C_shape = get_slice<T>(x_dim_t, rank - 1);
+      Tensor dim_g = full<T>({1}, groups, x_dim_t.type());
+      Tensor dim_c_div_g = cast<T>(C_shape / dim_g, x_dim_t.type());
+      tar_shape = concat<T>({N_shape, dim_1, dim_g, dim_c_div_g});
+    }
+    x_cast = backend::reshape<T>(x_cast, tar_shape);
+    mean_ = mean_decomp<T>(x_cast, c_axis, true);
     Tensor var_tmp_ =
-        mean_decomp<T>(x_cast * x_cast, IntArray(one_axis), true) -
-        mean_ * mean_;
+        mean_decomp<T>(x_cast * x_cast, c_axis, true) - mean_ * mean_;
     var_ = maximum<T>(
        var_tmp_,
        backend::full_with_tensor<T>(shape<T>(var_tmp_), 0, var_tmp_.dtype()));
@@ -881,23 +934,33 @@ std::tuple<Tensor, Tensor, Tensor> group_norm_decomp(
     out = backend::reshape<T>(res, x_dim_t);
   } else {
     auto x_dim = x_cast.shape();
-    std::vector<int64_t> one_axis(1, 1);
-
-    std::vector<int64_t> x_shape{x_dim[0] * groups, -1};
-    x_cast = reshape<T>(x_cast, x_shape);
-    mean_ = mean_decomp<T>(x_cast, IntArray(one_axis), true);
-    auto var_tmp_ = mean_decomp<T>(x_cast * x_cast, IntArray(one_axis), true) -
-                    mean_ * mean_;
+    if (data_format == "NCHW") {
+      x_cast = reshape<T>(x_cast, {x_dim[0] * groups, -1});
+    } else {
+      int c_div_g = x_dim[rank - 1] / groups;
+      x_cast = reshape<T>(x_cast, {x_dim[0], -1, groups, c_div_g});
+    }
+    mean_ = mean_decomp<T>(x_cast, c_axis, true);
+    auto var_tmp_ =
+        mean_decomp<T>(x_cast * x_cast, c_axis, true) - mean_ * mean_;
     var_ = maximum<T>(var_tmp_, full<T>(var_tmp_.shape(), 0, var_tmp_.dtype()));
     auto var_inv = rsqrt<T>(var_ + full<T>(empty_shape, epsilon, var_.dtype()));
     auto res = (x_cast - mean_) * var_inv;
     out = reshape<T>(res, x_dim);
   }
-  std::vector<int64_t> slice_bias_shape{-1, 1, 1};
+  std::vector<int64_t> slice_bias_shape;
+  slice_bias_shape = {-1};
+  for (size_t i = 0; i < rank - 2; i++) {
+    slice_bias_shape.push_back(1);
+  }
   Tensor scale_cast;
   if (scale) {
-    scale_cast = reshape<T>(scale.get(), slice_bias_shape);
+    if (data_format == "NCHW") {
+      scale_cast = reshape<T>(scale.get(), slice_bias_shape);
+    } else {
+      scale_cast = scale.get();
+    }
     if (need_cast) {
       scale_cast = cast<T>(scale_cast, DataType::FLOAT32);
     }
@@ -905,121 +968,35 @@ std::tuple<Tensor, Tensor, Tensor> group_norm_decomp(
   }
   Tensor bias_cast;
   if (bias) {
-    bias_cast = reshape<T>(bias.get(), slice_bias_shape);
+    if (data_format == "NCHW") {
+      bias_cast = reshape<T>(bias.get(), slice_bias_shape);
+    } else {
+      bias_cast = bias.get();
+    }
     if (need_cast) {
       bias_cast = cast<T>(bias_cast, DataType::FLOAT32);
     }
     out = out + bias_cast;
   }
   Tensor mean_out, var_out;
-  if (has_dynamic_shape(x.shape())) {
+  if (has_dynamic_shape(x_cast.shape())) {
     Tensor x_shape = get_slice<T>(x_dim_t, 0);
     Tensor dim_1 = full<T>({1}, groups, x_shape.type());
     x_shape = concat<T>({x_shape, dim_1});
     mean_out = backend::reshape<T>(mean_, x_shape);
     var_out = backend::reshape<T>(var_, x_shape);
   } else {
-    auto x_dim = x.shape();
-    std::vector<int64_t> res_shape{x_dim[0], groups};
+    std::vector<int64_t> res_shape{x.shape().at(0), groups};
     mean_out = reshape<T>(mean_, res_shape);
     var_out = reshape<T>(var_, res_shape);
   }
   if (need_cast) {
     out = cast<T>(out, org_dtype);
   }
-  if (rank == 3) {
-    out = squeeze<T>(out, {-1});
-  }
 
   return std::make_tuple(out, mean_out, var_out);
 }
 
-template <typename T>
-Tensor tile_decomp(const Tensor& x, const IntArray& repeat_times) {
-  // x.shape = [3,4] repeat_time=(a,b,c)
-  // shape1 = [1,3,4]
-  // shape2 = [1,1,1,3,1,4]
-  // shape3 = [a,1,b,3,c,4]
-  // shape4 = shape1 -> [a, b*3, c*4]
-  // t1 = x.reshape(shape1)
-  // t2 = t1.reshape(shape2)
-  // t3 = t2.expand(shape3)
-  // res = t3.reshape(t3)
-  std::vector<int64_t> repeat_times_ = repeat_times.GetData();
-  std::vector<int64_t> shape1 = x.shape();
-  auto diff = int64_t(repeat_times_.size()) - int64_t(shape1.size());
-  Tensor t1;
-  if (has_dynamic_shape(shape1)) {
-    size_t repeat_time_length = repeat_times_.size();
-    std::vector<int64_t> unsqueeze_idx2;
-    if (diff > 0) {
-      std::vector<int64_t> unsqueeze_idx1(diff);
-      std::iota(unsqueeze_idx1.begin(), unsqueeze_idx1.end(), 0);
-      t1 = unsqueeze<T>(x, unsqueeze_idx1);
-    } else {
-      t1 = x;
-    }
-    auto length2 = t1.dims().size();
-    for (size_t i = 0; i < repeat_times_.size(); i++) {
-      unsqueeze_idx2.push_back(length2 - repeat_times_.size() + i * 2);
-    }
-
-    Tensor t2 = unsqueeze<T>(t1, unsqueeze_idx2);
-    std::vector<int64_t> ref_shape(t2.dims().size(), 1);
-    for (size_t i = 0; i < unsqueeze_idx2.size(); i++) {
-      ref_shape[unsqueeze_idx2[i]] = repeat_times_[i];
-    }
-    Tensor ref_t = full<T>(ref_shape, 1.0, t2.dtype());
-    Tensor t3 = t2 * ref_t;
-    Tensor origin_shape_t = shape<T>(t1);
-    std::vector<Tensor> res_s;
-    for (int64_t i = int64_t(length2) - 1; i >= 0; i--) {
-      auto relative_idx =
-          int64_t(repeat_time_length) - 1 - int64_t(length2 - i - 1);
-
-      if (relative_idx >= 0) {
-        res_s.insert(
-            res_s.begin(),
-            get_slice<T>(origin_shape_t, i) * repeat_times_[relative_idx]);
-      } else {
-        res_s.insert(res_s.begin(), get_slice<T>(origin_shape_t, i));
-      }
-    }
-    Tensor s4 = concat<T>(res_s, 0);
-    return backend::reshape_with_tensor<T>(t3, s4);
-
-  } else {
-    if (diff > 0) {
-      for (int64_t i = 0; i < diff; i++) {
-        shape1.insert(shape1.begin(), 1);
-      }
-    }
-
-    auto length = int64_t(shape1.size());
-    std::vector<int64_t> shape2 = shape1;
-    std::vector<int64_t> shape3 = shape1;
-    std::vector<int64_t> final_shape = shape1;
-    auto r_length = repeat_times_.size();
-    for (size_t j = 0; j < repeat_times_.size(); j++) {
-      int64_t i = int64_t(j);
-
-      shape2.insert(shape2.begin() + (length - 1 - i), 1);
-      shape3.insert(shape3.begin() + (length - 1 - i),
-                    repeat_times_[r_length - i - 1]);
-
-      final_shape[length - i - 1] =
-          final_shape[length - i - 1] * repeat_times_[r_length - i - 1];
-    }
-
-    t1 = reshape<T>(x, shape1);
-
-    auto t2 = reshape<T>(t1, shape2);
-    auto t3 = t2.expand(shape3);
-    auto res = reshape<T>(t3, final_shape);
-    return res;
-  }
-}
-
 template <typename T>
 Tensor square_decomp(const Tensor& x) {
   auto org_dtype = x.dtype();
diff --git a/paddle/fluid/primitive/primitive.yaml b/paddle/fluid/primitive/primitive.yaml
index 58c3ac09b782a..f5e99706faf97 100644
--- a/paddle/fluid/primitive/primitive.yaml
+++ b/paddle/fluid/primitive/primitive.yaml
@@ -4,6 +4,7 @@
 - divide
 - elementwise_pow
 - rsqrt
+- sqrt
 - sin
 - sinh
 - asin
diff --git a/paddle/fluid/primitive/rule/vjp/details.h b/paddle/fluid/primitive/rule/vjp/details.h
index 467469106f064..f12626d95257d 100644
--- a/paddle/fluid/primitive/rule/vjp/details.h
+++ b/paddle/fluid/primitive/rule/vjp/details.h
@@ -69,7 +69,7 @@ void divide_grad(const Tensor& x,
                  Tensor* dy) {
   if (dy) {
     // dy = -(x/y^2) * dout
-    auto dy_res = -(x / y.pow(2.0)) * out_grad;
+    auto dy_res = -(x / (y * y)) * out_grad;
     if (out_grad.dims() != y.dims()) {
       phi::DDim reduce_dim =
           get_reduce_dims_from_out(out_grad.dims(), y.dims());
@@ -163,8 +163,7 @@ void gelu_grad(const Tensor& x,
   // Promote to fp32 when the input type is fp16 for keeping consistent with
   // phi kernel
 
-  if (x.dtype() == phi::DataType::FLOAT16 ||
-      x.dtype() == phi::DataType::BFLOAT16) {
+  if (is_half_dtype(x.dtype())) {
     auto promoted_x = cast<T>(x, phi::DataType::FLOAT32);
     auto promoted_out_grad = cast<T>(out_grad, phi::DataType::FLOAT32);
     if (approximate) {
@@ -566,9 +565,7 @@ void layer_norm_grad(const Tensor& x,
   auto x_sub_mean = x_cast - mean_;          // M,N
   auto tmp = (1.0 / (variance_ + epsilon));  // M,1
-  // auto sqrt_var_1 = sqrt<T>(tmp);         // M,1
-  auto sqrt_var_1 = elementwise_pow<T>(
-      tmp, full<T>(common::vectorize(tmp.dims()), 0.5, tmp.dtype()));
+  auto sqrt_var_1 = sqrt<T>(tmp);            // M,1
   auto x_sub_mean_mul_sqrt_var_1 = x_sub_mean * sqrt_var_1;
 
   if (x_grad) {
@@ -721,6 +718,15 @@ void sqrt_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) {
   }
 }
 
+template <typename T>
+void rsqrt_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) {
+  if (x_grad) {
+    // This calculation is important for resnet.
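+    // Same derivation as rsqrt_grad in composite_backward_api.h:
+    // d(rsqrt(x))/dx = -0.5 * out^3.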
+ auto x_grad_tmp = -0.5 * out * out * out * out_grad; + set_output(x_grad_tmp, x_grad); + } +} + template void silu_grad(const Tensor& x, const Tensor& out, @@ -1254,7 +1260,7 @@ void batch_norm_grad(const Tensor& x, auto eps = full(common::vectorize(run_var.dims()), epsilon, run_var.dtype()); mean_data = run_mean; - rsqrt_var = (run_var + eps).pow(-0.5); + rsqrt_var = rsqrt(run_var + eps); } else { mean_data = saved_mean; rsqrt_var = saved_variance; @@ -1513,11 +1519,36 @@ void group_norm_grad(const Tensor& x, // cal d_bias: // d_bias = sum(dy, axes=(0,2,3)) DataLayout data_layout_ = common::StringToDataLayout(data_layout); - if (data_layout_ != DataLayout::kNCHW) { - // TODO(chengyanfu): Subsequent support NHWC + std::vector x_dims = x.shape(); + int rank = x_dims.size(); + if (rank < 3 || rank > 5) { + PADDLE_THROW(phi::errors::Unimplemented( + "Only support NCHW and NHWC format in rank {3, 4, 5}.")); + } + int N = x_dims[0]; + int C; + int hw = 1; + std::vector reduce_axis; + + if (data_layout_ == DataLayout::kNCHW) { + C = x_dims[1]; + for (int i = 2; i < rank; ++i) { + hw *= x_dims[i]; + reduce_axis.push_back(i); + } + } else if (data_layout_ == DataLayout::kNHWC) { + C = x_dims[rank - 1]; + for (int i = 1; i < (rank - 1); ++i) { + hw *= x_dims[i]; + reduce_axis.push_back(i); + } + } else { PADDLE_THROW(phi::errors::InvalidArgument("Unsupported storage order: %s", data_layout)); } + + int g_num = C / groups; + Tensor x_data = x; Tensor out_grad_data = out_grad; @@ -1531,37 +1562,38 @@ void group_norm_grad(const Tensor& x, out_grad_data = cast(out_grad, phi::DataType::FLOAT32); } - std::vector x_dims = common::vectorize(x.dims()); - auto add_axis = std::vector({-1}); - const int N = x_dims[0]; - const int C = x_dims[1]; + auto shape_group = std::vector({N, groups, g_num}); - const int hw = x_dims[2] * x_dims[3]; - const int g_num = C / groups; + std::vector whole_group_shape; + if (data_layout_ == DataLayout::kNCHW) { + whole_group_shape = std::vector({N, groups, g_num, -1}); + } else { + whole_group_shape = std::vector({N, -1, groups, g_num}); + } + auto var_eps = variance + epsilon; - auto reduce_axis = IntArray(std::vector({2, 3})); - auto shape_group = IntArray(std::vector({N, groups, g_num})); - auto whole_group_shape = - IntArray(std::vector({N, groups, g_num, hw})); + auto inv_std = rsqrt(var_eps); - auto scale_ptr = scale.get_ptr(); - auto bias_ptr = bias.get_ptr(); - auto sqrt_element = 1.0 / (variance + epsilon); - auto inv_std = elementwise_pow( - sqrt_element, - full( - common::vectorize(sqrt_element.dims()), 0.5, sqrt_element.dtype())); auto inv_std_mul_s = inv_std / hw / g_num; auto dtype = x_data.dtype(); auto sum_y_grad_mul_x = sum(out_grad_data * x_data, reduce_axis, dtype, false); auto sum_y_grad = sum(out_grad_data, reduce_axis, dtype, false); + + Tensor scale_data; + if (scale) { + scale_data = scale.get(); + } + Tensor bias_data; + if (bias) { + bias_data = bias.get(); + } + if (x_grad) { Tensor d1; Tensor d2; Tensor p1; - if (scale_ptr) { - auto scale_data = scale.get(); + if (scale) { if (scale_data.dtype() == phi::DataType::FLOAT16 || scale_data.dtype() == phi::DataType::BFLOAT16) { scale_data = cast(scale_data, phi::DataType::FLOAT32); @@ -1573,22 +1605,29 @@ void group_norm_grad(const Tensor& x, p1 = reshape(inv_std, std::vector({N, groups, 1})) * reshape(scale_data, std::vector({1, groups, g_num})); } else { - d1 = (reshape(sum_y_grad_mul_x, shape_group)) - .sum(std::vector({2}), dtype, false); - d2 = (reshape(sum_y_grad, shape_group)) - 
.sum(std::vector({2}), dtype, false); - p1 = (reshape(inv_std, std::vector({N, groups, 1}))) - .expand(IntArray(shape_group)); + d1 = (reshape(sum_y_grad_mul_x, shape_group)).sum({2}, dtype, false); + d2 = (reshape(sum_y_grad, shape_group)).sum({2}, dtype, false); + p1 = (reshape(inv_std, {N, groups, 1})) + .expand(shape_group); // [n, g, g_n] } - auto p2 = (d2 * mean - d1) * (inv_std_mul_s * inv_std * inv_std); + auto p2 = (d2 * mean - d1) * (inv_std_mul_s / var_eps); // [n, g] auto p3 = -p2 * mean - d2 * inv_std_mul_s; - auto first_shape = get_unsqueeze_dims(p1, std::vector({3})); - auto second_shape = get_unsqueeze_dims(p2, std::vector({2, 3})); + std::vector first_shape; + std::vector second_shape; + if (data_layout_ == DataLayout::kNCHW) { + first_shape = get_unsqueeze_dims(p1, {3}); // [n, g, g_n, 1] + second_shape = get_unsqueeze_dims(p2, {2, 3}); // [n, g, 1, 1] + } else { + first_shape = get_unsqueeze_dims(p1, {1}); // [n, 1, g, g_n] + second_shape = get_unsqueeze_dims(p2, {1, 3}); // [n, 1, g, 1] + } + p1 = reshape(p1, first_shape); p2 = reshape(p2, second_shape); p3 = reshape(p3, second_shape); - auto tmp_1 = reshape(out_grad_data, whole_group_shape) * p1; + auto tmp_1 = + reshape(out_grad_data, whole_group_shape) * p1; // [n, hw, g, g_n] auto tmp_2 = reshape(x_data, whole_group_shape) * p2 + p3; auto x_grad_data = tmp_1 + tmp_2; x_grad_data = reshape(x_grad_data, x.shape()); @@ -1599,29 +1638,24 @@ void group_norm_grad(const Tensor& x, set_output(x_grad_data, x_grad); } + if (scale_grad) { - if (scale_ptr) { - auto third_shape = get_unsqueeze_dims(mean, std::vector({2})); + if (scale) { + auto third_shape = get_unsqueeze_dims(mean, {2}); auto tmp1 = (reshape(sum_y_grad_mul_x, shape_group) - reshape(sum_y_grad, shape_group) * reshape(mean, third_shape)) * reshape(inv_std, third_shape); - auto scale_grad_tmp = reshape( - tmp1.sum(std::vector({0}), scale_ptr->dtype(), false), - IntArray(std::vector({C}))); + auto scale_grad_tmp = + reshape(tmp1.sum({0}, scale->dtype(), false), {C}); set_output(scale_grad_tmp, scale_grad); - } else { - scale_grad = nullptr; } } if (bias_grad) { - if (bias_ptr) { - auto bias_grad_tmp = - sum_y_grad.sum(std::vector({0}), bias_ptr->dtype(), false); + if (bias) { + auto bias_grad_tmp = sum_y_grad.sum({0}, bias->dtype(), false); set_output(bias_grad_tmp, bias_grad); - } else { - bias_grad = nullptr; } } } diff --git a/paddle/fluid/primitive/utils/utils.h b/paddle/fluid/primitive/utils/utils.h index c67886bc2ed2c..42f1533db723e 100644 --- a/paddle/fluid/primitive/utils/utils.h +++ b/paddle/fluid/primitive/utils/utils.h @@ -138,7 +138,7 @@ static phi::DDim get_reduce_dims_from_out(const phi::DDim& dout_dims, result.push_back(i); } for (int i = 0; i < in_dims.size(); ++i) { - if (in_dims[i] == 1 && dout_dims[i] != 1) { + if (in_dims[i] == 1 && dout_dims[i + bat] != 1) { result.push_back(i + bat); } else { PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index ecf95eb234972..0a32e0ea8f9ff 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -41,6 +41,7 @@ set(PYBIND_DEPS op_dialect_vjp program_translator pir_transforms + pir_save_load new_profiler fluid_jit prim_utils @@ -399,11 +400,11 @@ if(WITH_PYTHON) list(APPEND OP_IMPL_DEPS ${op_impl_path}/openblas.dll) list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/openblas.dll) endif() - if(WITH_MKLDNN) + if(WITH_ONEDNN) add_custom_command( OUTPUT ${op_impl_path}/mkldnn.dll COMMAND ${CMAKE_COMMAND} -E copy 
                ${MKLDNN_SHARED_LIB} ${op_impl_path}
-        DEPENDS mkldnn)
+        DEPENDS onednn)
       list(APPEND OP_IMPL_DEPS ${op_impl_path}/mkldnn.dll)
       list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/mkldnn.dll)
     endif()
@@ -474,12 +475,12 @@ if(WITH_PYTHON)
       list(APPEND OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libiomp5.so)
       list(APPEND EAGER_OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libiomp5.so)
     endif()
-    if(WITH_MKLDNN)
+    if(WITH_ONEDNN)
       add_custom_command(
         OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0
         COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB}
                 ${CMAKE_CURRENT_BINARY_DIR}
-        DEPENDS mkldnn)
+        DEPENDS onednn)
       list(APPEND OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0)
       list(APPEND EAGER_OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0)
     endif()
diff --git a/paddle/fluid/pybind/dist_api.cc b/paddle/fluid/pybind/dist_api.cc
index 44feb061438e8..fd4066682161e 100644
--- a/paddle/fluid/pybind/dist_api.cc
+++ b/paddle/fluid/pybind/dist_api.cc
@@ -17,6 +17,8 @@
 #include "paddle/fluid/pir/dialect/distributed/ir/dist_api.h"
 #include "paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h"
+#include "paddle/fluid/pir/dialect/distributed/transforms/dist_to_dense_pass.h"
+#include "paddle/fluid/pir/dialect/distributed/transforms/mix_to_dist_pass.h"
 #include "paddle/fluid/pybind/dist_api.h"
 #include "paddle/fluid/pybind/dist_static_op_function.h"
 #include "paddle/phi/core/enforce.h"
@@ -105,6 +107,11 @@ void BindDistOpsAPI(pybind11::module *module) {
   }
 }
+void BindDistPassAPI(pybind11::module *module) {
+  module->def("apply_mix2dist_pass", paddle::dialect::MixToDistPass);
+  module->def("apply_dist2dense_pass", paddle::dialect::DistToDensePass);
+}
+
 void BindOpsFunction(py::module *m) {
   m->def("reshard_v2",
          [](const pir::Value &x, const TensorDistAttribute &dist_attr) {
@@ -116,6 +123,7 @@ void BindDistApi(pybind11::module *module) {
   auto ir_module = module->def_submodule("pir");
   BindOperationDistAttribute(&ir_module);
   BindTensorDistAttribute(&ir_module);
+  BindDistPassAPI(&ir_module);
   auto ops_modules = ir_module.def_submodule("ops");
   BindDistOpsAPI(&ops_modules);
   BindOpsFunction(&ops_modules);
diff --git a/paddle/fluid/pybind/dist_static_op_function.h b/paddle/fluid/pybind/dist_static_op_function.h
index afd71b7521567..c23a16bca2730 100644
--- a/paddle/fluid/pybind/dist_static_op_function.h
+++ b/paddle/fluid/pybind/dist_static_op_function.h
@@ -18,6 +18,7 @@
 #include "paddle/fluid/pir/dialect/operator/ir/api_builder.h"
 #include "paddle/fluid/pybind/eager_utils.h"
 #include "paddle/fluid/pybind/exception.h"
+#include "paddle/fluid/pybind/pir.h"
 #include "paddle/phi/core/enforce.h"
 namespace paddle {
@@ -66,12 +67,42 @@ static PyObject *static_api_reshard(PyObject *self,
     PyObject *process_mesh_obj = PyTuple_GET_ITEM(args, 1);
     auto process_mesh = CastPyArg2ProcessMesh(process_mesh_obj, 1);
-    PyObject *dims_mapping_obj = PyTuple_GET_ITEM(args, 2);
-    auto dims_mapping = CastPyArg2VectorOfInt64(dims_mapping_obj, 2);
+    PyObject *placements_obj = PyTuple_GET_ITEM(args, 2);
+    auto placements = CastPyArg2VectorOfPlacement(placements_obj, 2);
+
+    int64_t ndim = GetValueDims(input).size();
+    std::vector<int64_t> dim_map(ndim, -1);
+    for (size_t i = 0; i < placements.size(); i++) {
+      auto &placement = placements[i];
+      if (placement->is_shard()) {
+        auto shard_dim =
+            dynamic_cast<phi::distributed::Shard &>(*placement).get_dim();
+        PADDLE_ENFORCE_EQ(
+            dim_map[shard_dim],
+            -1,
+            common::errors::InvalidArgument(
+                "Tensor dim %lld is already sharded on mesh dim %lld,"
+                " DistTensor operator implementation does not support things "
+                "like hybrid"
+                " sharding strategies yet (e.g. [Shard(0), Shard(0)])",
+                shard_dim,
+                dim_map[shard_dim]));
+        dim_map[shard_dim] = i;
+      }
+    }
+    paddle::flat_hash_map<int64_t, phi::ReduceType> partial_status;
+    for (size_t i = 0; i < placements.size(); ++i) {
+      auto &p = placements[i];
+      if (p->is_partial()) {
+        partial_status.insert(
+            {i,
+             dynamic_cast<phi::distributed::Partial &>(*p).get_reduce_type()});
+      }
+    }
     // Call ir static api
     auto static_api_out =
-        paddle::dialect::reshard(input, process_mesh, dims_mapping);
+        paddle::dialect::reshard(input, process_mesh, dim_map, partial_status);
     return ToPyObject(static_api_out);
   } catch (...) {
diff --git a/paddle/fluid/pybind/eager_py_layer.cc b/paddle/fluid/pybind/eager_py_layer.cc
index daaac0c20e780..fb4235f619e99 100644
--- a/paddle/fluid/pybind/eager_py_layer.cc
+++ b/paddle/fluid/pybind/eager_py_layer.cc
@@ -478,9 +478,11 @@ PyObject* pylayer_method_apply(PyObject* cls,
   for (size_t i = 0; i < inputs_autograd_meta.size(); i++) {
     if (ctx->forward_input_tensor_is_duplicable[i]) {
+      std::vector<paddle::Tensor*> tmp;
       for (auto t : inputs_tensor[i]) {
-        grad_node->SetGradOutMeta(*t, i);
+        tmp.push_back(t);
       }
+      grad_node->SetGradOutMeta(tmp, i);
     } else {
       grad_node->SetGradOutMeta(*inputs_tensor[i][0], i);
     }
@@ -490,9 +492,7 @@ PyObject* pylayer_method_apply(PyObject* cls,
     if (ctx->forward_output_tensor_is_duplicable[i]) {
       egr::EagerUtils::SetOutRankWithSlot(&outputs_autograd_meta[i], i);
       egr::EagerUtils::SetHistory(&outputs_autograd_meta[i], grad_node);
-      for (auto t : outputs_tensor[i]) {
-        grad_node->SetGradInMeta(*t, i);
-      }
+      grad_node->SetGradInMeta(outputs_tensor[i], i);
     } else {
       egr::EagerUtils::SetOutRankWithSlot(outputs_autograd_meta[i][0], i);
       egr::EagerUtils::SetHistory(outputs_autograd_meta[i][0], grad_node);
diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc
index aba7c99662bbe..48f0168196949 100644
--- a/paddle/fluid/pybind/eager_utils.cc
+++ b/paddle/fluid/pybind/eager_utils.cc
@@ -32,7 +32,6 @@ limitations under the License. */
 #include "paddle/fluid/jit/function.h"
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/operators/py_func_op.h"
-#include "paddle/fluid/operators/utils.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_type.h"
 #include "paddle/fluid/pir/dialect/operator/utils/utils.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -47,6 +46,7 @@ limitations under the License. */
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/distributed/auto_parallel/placement_types.h"
 #include "paddle/phi/core/distributed/auto_parallel/process_mesh.h"
+#include "paddle/phi/core/tensor_utils.h"
 #include "paddle/pir/include/core/attribute.h"
 COMMON_DECLARE_bool(check_nan_inf);
@@ -2478,6 +2478,7 @@ PyObjectHolder::PyObjectHolder(PyObject* ptr) { ptr_ = ptr; }
 PyObjectHolder::~PyObjectHolder() {  // NOLINT
   ::pybind11::gil_scoped_acquire gil;
+  // NOTE(deepllz): ptr_ is owned by this object, so release it in destructor.
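+  // Py_XDECREF (unlike Py_DECREF) is a no-op when ptr_ is null, so a holder
+  // whose pointer was never set is destroyed safely; the GIL is acquired
+  // above because dropping the last reference may run arbitrary Python
+  // finalizers.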
Py_XDECREF(ptr_); } @@ -2512,7 +2513,10 @@ std::shared_ptr PackHook::operator()( bool grad_tmp = egr::Controller::Instance().HasGrad(); egr::Controller::Instance().SetHasGrad(false); ::pybind11::gil_scoped_acquire gil; - auto args = PyTuple_New(1); + PyObject* args = PyTuple_New(1); + PADDLE_ENFORCE_NOT_NULL(args, + paddle::platform::errors::External( + pybind11::detail::error_string().c_str())); PyTuple_SET_ITEM(args, 0, paddle::pybind::ToPyObject(tensor)); PyObject* ret = PyObject_Call(hook_, args, nullptr); PADDLE_ENFORCE_NOT_NULL(ret, @@ -2527,7 +2531,10 @@ void* PackHook::operator()(void* py_tensor) { bool grad_tmp = egr::Controller::Instance().HasGrad(); egr::Controller::Instance().SetHasGrad(false); ::pybind11::gil_scoped_acquire gil; - auto args = PyTuple_New(1); + PyObject* args = PyTuple_New(1); + PADDLE_ENFORCE_NOT_NULL(args, + paddle::platform::errors::External( + pybind11::detail::error_string().c_str())); Py_INCREF(reinterpret_cast(py_tensor)); PyTuple_SET_ITEM(args, 0, reinterpret_cast(py_tensor)); PyObject* ret = PyObject_Call(hook_, args, nullptr); @@ -2551,13 +2558,20 @@ paddle::Tensor UnPackHook::operator()( bool grad_tmp = egr::Controller::Instance().HasGrad(); egr::Controller::Instance().SetHasGrad(false); ::pybind11::gil_scoped_acquire gil; - auto args = PyTuple_New(1); - Py_INCREF(reinterpret_cast(packed_value->get())); - PyTuple_SET_ITEM(args, 0, reinterpret_cast(packed_value->get())); + PyObject* args = PyTuple_New(1); + PADDLE_ENFORCE_NOT_NULL(args, + paddle::platform::errors::External( + pybind11::detail::error_string().c_str())); + PyObject* py_packed_value = reinterpret_cast(packed_value->get()); + Py_INCREF(py_packed_value); + PyTuple_SET_ITEM(args, 0, py_packed_value); PyObject* ret = PyObject_Call(hook_, args, nullptr); PADDLE_ENFORCE_NOT_NULL(ret, paddle::platform::errors::External( pybind11::detail::error_string().c_str())); + // NOTE(deepllz): tupledealloc will cause the reference count of the objects + // in it to be decremented by one, so no need to call + // Py_XDECREF(py_packed_value) Py_XDECREF(args); egr::Controller::Instance().SetHasGrad(grad_tmp); @@ -2576,7 +2590,10 @@ void* UnPackHook::operator()(void* packed_value, void* other) { bool grad_tmp = egr::Controller::Instance().HasGrad(); egr::Controller::Instance().SetHasGrad(false); ::pybind11::gil_scoped_acquire gil; - auto args = PyTuple_New(1); + PyObject* args = PyTuple_New(1); + PADDLE_ENFORCE_NOT_NULL(args, + paddle::platform::errors::External( + pybind11::detail::error_string().c_str())); Py_INCREF(reinterpret_cast(packed_value)); PyTuple_SET_ITEM(args, 0, reinterpret_cast(packed_value)); PyObject* ret = PyObject_Call(hook_, args, nullptr); diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index b70efdbabbebc..55173bad9a1c8 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -54,7 +54,6 @@ limitations under the License. */ #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/imperative/xccl_context.h" #include "paddle/fluid/memory/allocation/mmap_allocator.h" -#include "paddle/fluid/operators/utils.h" #include "paddle/fluid/pybind/cuda_streams_py.h" #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/pybind_variant_caster.h" @@ -62,6 +61,7 @@ limitations under the License. 
*/ #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/pybind/uva_utils.h" #include "paddle/phi/core/compat/arg_map_context.h" +#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/type_defs.h" namespace paddle { diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 2996133948cc6..457bc649f98d1 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -1225,8 +1225,8 @@ void BindPaddleInferPredictor(py::module *m) { .def("try_shrink_memory", &paddle_infer::Predictor::TryShrinkMemory) .def("clear_intermediate_tensor", &paddle_infer::Predictor::ClearIntermediateTensor) - .def("register_output_hook", - &paddle_infer::Predictor::RegisterOutputHook); + .def("register_output_hook", &paddle_infer::Predictor::RegisterOutputHook) + .def("register_input_hook", &paddle_infer::Predictor::RegisterInputHook); } void BindZeroCopyTensor(py::module *m) { diff --git a/paddle/fluid/pybind/io.cc b/paddle/fluid/pybind/io.cc index 9075e904ef4b8..d38dbf72643ce 100644 --- a/paddle/fluid/pybind/io.cc +++ b/paddle/fluid/pybind/io.cc @@ -17,6 +17,8 @@ limitations under the License. */ #include "paddle/fluid/framework/io/save_load_tensor.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/selected_rows_utils.h" +#include "paddle/fluid/pir/serialize_deserialize/include/interface.h" +#include "paddle/fluid/pir/serialize_deserialize/include/save_load_parameters.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/pybind/pybind_variant_caster.h" #include "paddle/utils/pybind.h" @@ -122,6 +124,24 @@ void BindIO(pybind11::module *m) { paddle::framework::LoadTensor(path, &tensor_load); return tensor_load; }); + + m->def("save_func", &pir::SaveFunction); + + m->def("save_combine_func", &pir::SaveCombineFunction); + + m->def("load_func", &pir::LoadFunction); + + m->def("load_combine_func", &pir::LoadCombineFunction); + + m->def("serialize_pir_program", + &pir::WriteModule, + py::arg("program"), + py::arg("file_path"), + py::arg("pir_version"), + py::arg("overwrite") = true, + py::arg("readable") = false, + py::arg("trainable") = true); + m->def("deserialize_pir_program", &pir::ReadModule); } } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/manual_static_op_function.h b/paddle/fluid/pybind/manual_static_op_function.h index 7767c4a4569b3..8943633fb4cda 100644 --- a/paddle/fluid/pybind/manual_static_op_function.h +++ b/paddle/fluid/pybind/manual_static_op_function.h @@ -159,7 +159,7 @@ PyObject *static_api_full(PyObject *self, PyObject *args, PyObject *kwargs) { CallStackRecorder callstack_recoder("full_with_tensor"); callstack_recoder.Record(); auto static_api_out = - paddle::dialect::full_with_tensor(shape, value, dtype); + paddle::dialect::full_with_tensor(value, shape, dtype); callstack_recoder.AttachToOps(); return ToPyObject(static_api_out); diff --git a/paddle/fluid/pybind/pir.cc b/paddle/fluid/pybind/pir.cc index 2568e5eef4c5e..4176ecf0bbcbb 100644 --- a/paddle/fluid/pybind/pir.cc +++ b/paddle/fluid/pybind/pir.cc @@ -32,7 +32,6 @@ #include "paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h" #include "paddle/fluid/pir/dialect/distributed/ir/dist_dialect.h" #include "paddle/fluid/pir/dialect/distributed/ir/dist_type.h" -#include "paddle/fluid/pir/dialect/distributed/transforms/mix_to_dist_pass.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h" #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" #include 
"paddle/fluid/pir/dialect/operator/ir/api_builder.h" @@ -72,6 +71,7 @@ #ifdef PADDLE_WITH_CINN #include "paddle/cinn/hlir/dialect/operator/ir/op_dialect.h" #include "paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.h" +#include "paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.h" #include "paddle/cinn/hlir/framework/pir_compiler.h" #endif @@ -301,6 +301,15 @@ void BindProgram(py::module *m) { [](std::shared_ptr self, int64_t random_seed) { SetProgramInt64Attr(self, "random_seed", random_seed); }) + .def_property_readonly( + "blocks", + [](const std::shared_ptr &self) { + // Note: We only return global block currently. + py::list op_list; + op_list.append(self->block()); + return op_list; + }, + return_value_policy::reference) .def("get_output_value_by_name", [](Program &self, const std::string &name) { return GetOutputValueByName(self, name); @@ -638,6 +647,10 @@ void BindOperation(py::module *m) { "callstack", [](Operation &self) -> py::list { py::list callstack_list; + if (!self.HasAttribute(paddle::framework::OpProtoAndCheckerMaker:: + OpCreationCallstackAttrName())) { + return callstack_list; + } pir::Attribute op_callstack = self.attribute( paddle::framework::OpProtoAndCheckerMaker:: OpCreationCallstackAttrName()); @@ -675,14 +688,19 @@ void BindOperation(py::module *m) { pir::ArrayAttribute::get(pir::IrContext::Instance(), op_callstack_infos)); }) - .def("dist_attr", [](Operation &self) { - if (self.HasAttribute(kAttrOpDistAttr)) { - return self.attribute(kAttrOpDistAttr); - } else { - PADDLE_THROW( - phi::errors::InvalidArgument("dist_attr is only for dist op.")); - } - }); + .def_property( + "dist_attr", + [](Operation &self) { + if (self.HasAttribute(kAttrOpDistAttr)) { + return self.attribute(kAttrOpDistAttr); + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "dist_attr is only for dist op.")); + } + }, + [](Operation &self, OperationDistAttribute op_dist_attr) { + self.set_attribute(kAttrOpDistAttr, op_dist_attr); + }); py::class_ block_container( *m, "Operation_BlockContainer", R"DOC( The Operation_BlockContainer only use to walk all blocks in the operation. 
@@ -1072,6 +1090,131 @@ void range_block_do(const Block *block, std::vector<int> range, F fn) {
   }
 }
+template <typename K, typename V>
+bool ExistsInMapValues(const std::map<K, V> &m, V value) {
+  for (const auto &[k, v] : m) {
+    if (v == value) {
+      return true;
+    }
+  }
+  return false;
+}
+
+std::map<int, int> GetOpInplaceInfo(const pir::Operation *op) {
+  std::map<int, int> inplace_info;
+  if (!op->HasTrait<paddle::dialect::InplaceTrait>()) {
+    return inplace_info;
+  }
+  pir::IrContext *ctx = pir::IrContext::Instance();
+  std::string op_name = op->name();
+  if (op->attributes().count("op_name")) {
+    op_name =
+        op->attributes().at("op_name").dyn_cast<pir::StrAttribute>().AsString();
+  }
+
+  pir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_name);
+  paddle::dialect::OpYamlInfoParser yaml_parser(
+      op_info.GetInterfaceImpl<paddle::dialect::OpYamlInfoInterface>()
+          ->get_op_info_(op_name),
+      paddle::dialect::IsLegacyOp(op_name));
+
+  for (size_t i = 0; i < op->num_results(); ++i) {
+    std::string value_name = yaml_parser.OutputNames()[i];
+    if (yaml_parser.HasInplace(value_name)) {
+      const std::string &inplace_name = yaml_parser.InplaceName(value_name);
+      inplace_info[i] = yaml_parser.InputName2Id().at(inplace_name);
+    }
+    if (yaml_parser.HasView(value_name)) {
+      const std::string &view_name = yaml_parser.ViewName(value_name);
+      inplace_info[i] = yaml_parser.InputName2Id().at(view_name);
+    }
+  }
+
+  return inplace_info;
+}
+
+std::vector<std::vector<pir::Value>> GetOpInplaceChains(const Block *block) {
+  std::vector<std::vector<pir::Value>> inplace_chains;
+  std::map<pir::Value, size_t> value_to_inplace_chain_index;
+
+  for (auto &op : *block) {
+    pir::Walk(&op, [&](Operation *inner_op) {
+      auto op_inplace_info = GetOpInplaceInfo(inner_op);
+      for (auto &[out_idx, in_idx] : op_inplace_info) {
+        auto target_value = inner_op->results()[out_idx];
+        auto source_value = inner_op->operands()[in_idx].source();
+        VLOG(8) << "Inplace Mapping: " << Value2String(source_value) << " -> "
+                << Value2String(target_value);
+
+        if (value_to_inplace_chain_index.count(source_value) == 0 &&
+            value_to_inplace_chain_index.count(target_value) == 0) {
+          size_t chain_insertion_idx = inplace_chains.size();
+          inplace_chains.push_back({source_value, target_value});
+          value_to_inplace_chain_index.insert(
+              {source_value, chain_insertion_idx});
+          value_to_inplace_chain_index.insert(
+              {target_value, chain_insertion_idx});
+        } else {
+          PADDLE_ENFORCE_NE(
+              value_to_inplace_chain_index.count(source_value),
+              0,
+              phi::errors::Unavailable("source value should be in the chain"));
+          PADDLE_ENFORCE_EQ(value_to_inplace_chain_index.count(target_value),
+                            0,
+                            phi::errors::Unavailable(
+                                "target value should not be in the chain"));
+          size_t chain_insertion_idx =
+              value_to_inplace_chain_index[source_value];
+          inplace_chains[chain_insertion_idx].push_back(target_value);
+          value_to_inplace_chain_index.insert(
+              {target_value, chain_insertion_idx});
+        }
+      }
+    });
+  }
+  return inplace_chains;
+}
+
+std::optional<pir::Value> FindInplaceSource(
+    const std::vector<std::vector<pir::Value>> inplace_chains,
+    pir::Value value) {
+  if (value.impl() == nullptr) {
+    return std::nullopt;
+  }
+  for (auto &chain : inplace_chains) {
+    for (auto &v : chain) {
+      if (v == value) {
+        return chain[0];
+      }
+    }
+  }
+  return std::nullopt;
+}
+
+std::map<pir::Value, pir::Value> ReplaceValueWithInplaceSource(
+    const std::vector<std::vector<pir::Value>> &source_domain,
+    std::vector<pir::Value> *target_values,
+    const std::vector<std::vector<pir::Value>> inplace_chains) {
+  std::map<pir::Value, pir::Value> replacements;
+  for (auto &target_value : *target_values) {
+    auto inplace_source = FindInplaceSource(inplace_chains, target_value);
+    if (!inplace_source.has_value()) {
+      continue;
+    }
+    for (auto &source_values : source_domain) {
+      if (std::find(source_values.begin(),
+                    source_values.end(),
+                    inplace_source.value()) != source_values.end()) {
+        VLOG(4) << "Replace " << Value2String(target_value) << " with "
+                << Value2String(inplace_source.value());
+        replacements.insert({target_value, inplace_source.value()});
+        target_value = inplace_source.value();
+      }
+    }
+  }
+  return replacements;
+}
+
 std::pair<std::vector<pir::Value>, std::unordered_set<pir::Value>>
 AnalysisMiddleVariable(const Program &program,
                        const std::vector<pir::Value> &forward_inputs,
@@ -1145,6 +1288,10 @@ static auto GetNoNeedBufferValue(const ::pir::Block *whole_block,
   std::unordered_set<::pir::Value> no_need_buffer_values;
   range_block_do(
       whole_block, range, [&need_buffer_values](::pir::Operation *op) {
+        // NOTE(SigureMo): We should process the CombineOp in its users.
+        if (op->isa<pir::CombineOp>()) {
+          return;
+        }
         if (op->HasInterface<paddle::dialect::OpYamlInfoInterface>() ==
            false) {
           // not a OpYamlInfoInterface, can't have no_need_buffer.
           for (const auto &operand : op->operands_source()) {
@@ -1155,8 +1302,16 @@ static auto GetNoNeedBufferValue(const ::pir::Block *whole_block,
             op->dyn_cast<paddle::dialect::OpYamlInfoInterface>().GetOpInfo();
         int counter = 0;
         for (const auto &op_input_info : std::get<0>(opinfo)) {
+          auto value = op->operand_source(counter);
           if (!op_input_info.no_need_buffer) {
-            need_buffer_values.insert(op->operand_source(counter));
+            need_buffer_values.insert(value);
+            if (!IsFakeValue(value) && value.defining_op() &&
+                value.defining_op()->isa<pir::CombineOp>()) {
+              for (const auto &combine_value :
+                   value.defining_op()->operands_source()) {
+                need_buffer_values.insert(combine_value);
+              }
+            }
           }
           counter += 1;
         }
@@ -1255,10 +1410,26 @@ SplitedResult SplitForwardBackward(
   pir::IrContext *ctx = pir::IrContext::Instance();
   auto forward_program = std::make_shared<Program>(ctx);
   auto backward_program = std::make_shared<Program>(ctx);
+  std::vector<pir::Value> forward_outputs_mutable = forward_outputs;
   std::vector<pir::Value> middle_values;
   std::unordered_set<pir::Value> backward_inputs;
+  const auto &inplace_chains = GetOpInplaceChains(program.block());
   std::tie(middle_values, backward_inputs) = AnalysisMiddleVariable(
       program, forward_in_out_values, forward_range, backward_range);
+
+  // Replace inplace values with their source values.
+  // NOTE(SigureMo): Why not process inplace values for forward_inputs in
+  // forward?
+  // Because all forward_inputs use the data op, and after the lower-to-kernel
+  // pass the data op is followed by a non-inplace shadow_feed op, so we don't
+  // need to process inplace for forward_inputs in the forward program.
+  // The same reasoning applies to the whole backward program: all backward
+  // inputs are created by block kwargs, which also adds a shadow_feed op
+  // after the lower-to-kernel pass.
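+  // For example (illustrative values): if the forward region computes
+  // relu_(param) -> v1 and then scale_(v1) -> v2, GetOpInplaceChains records
+  // the chain [param, v1, v2]; any later use of v1 or v2 as a middle or
+  // output value is rewritten to param below, since all three values alias
+  // the same underlying buffer.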
+ auto replacement_for_forward_middles = ReplaceValueWithInplaceSource( + {forward_params}, &middle_values, inplace_chains); + auto replacement_for_forward_outputs = ReplaceValueWithInplaceSource( + {forward_params}, &forward_outputs_mutable, inplace_chains); pir::Block &backward_block = *backward_program->block(); bool has_backward = (backward_range[1] > backward_range[0]); @@ -1283,8 +1454,13 @@ SplitedResult SplitForwardBackward( auto create_kwarg_fn = [&backward_block, &backward_inputs, &backward_value_map, + &replacement_for_forward_middles, + &replacement_for_forward_outputs, &counter](const pir::Value &v) { - if (v && backward_inputs.count(v)) { + if (v && !backward_value_map.count(v) && + (backward_inputs.count(v) || + ExistsInMapValues(replacement_for_forward_middles, v) || + ExistsInMapValues(replacement_for_forward_outputs, v))) { backward_value_map[v] = backward_block.AddKwarg( "input_" + std::to_string(counter++), v.type()); } @@ -1293,10 +1469,19 @@ SplitedResult SplitForwardBackward( auto create_output_fn_forward = [&ctx, &forward_value_map, &counter, - &forward_program](const pir::Value &v) { + &forward_program, + &forward_inputs, + &forward_params](const pir::Value &v) { if (v.impl() == nullptr) { return; } + // Skip the value that already in forward_inputs or forward_params. + if (std::find(forward_inputs.begin(), forward_inputs.end(), v) != + forward_inputs.end() || + std::find(forward_params.begin(), forward_params.end(), v) != + forward_params.end()) { + return; + } // NOTE(Aurelius84): we should skip insert ShadowOutputOp repeatedly by // calling SplitForwardBackward multi-times. std::string shadow_output_name = @@ -1350,14 +1535,14 @@ SplitedResult SplitForwardBackward( counter += 1; }; - // counter = 0; if (has_backward) { VLOG(4) << "start create backward inputs, creating keyword argument."; VLOG(4) << "Create keyword argument for backward program: fo, start with input_" << counter; - std::for_each( - forward_outputs.begin(), forward_outputs.end(), create_kwarg_fn); + std::for_each(forward_outputs_mutable.begin(), + forward_outputs_mutable.end(), + create_kwarg_fn); VLOG(4) << "Create keyword argument for backward program: fx, start with input_" << counter; @@ -1380,14 +1565,27 @@ SplitedResult SplitForwardBackward( create_kwarg_fn); VLOG(4) << "Create keyword argument for backward program end. input_" << counter; + + // Update the value map with inplace source value. + VLOG(4) << "start update inplace names"; + VLOG(4) << "replacement_for_forward_middles size is: " + << replacement_for_forward_middles.size(); + for (auto &[target, source] : replacement_for_forward_middles) { + backward_value_map[target] = backward_value_map.at(source); + } + VLOG(4) << "replacement_for_forward_outputs size is: " + << replacement_for_forward_outputs.size(); + for (auto &[target, source] : replacement_for_forward_outputs) { + backward_value_map[target] = backward_value_map.at(source); + } } - // counter = 0; VLOG(4) << "start create forward outputs, inserting set_parameter ops."; std::for_each( middle_values.begin(), middle_values.end(), create_output_fn_forward); - std::for_each( - forward_outputs.begin(), forward_outputs.end(), create_output_fn_forward); + std::for_each(forward_outputs_mutable.begin(), + forward_outputs_mutable.end(), + create_output_fn_forward); // Step2. copy backward ops . 
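+  // Each op in the backward range is cloned into backward_program below; the
+  // backward_mapper passed to Clone() rewrites the cloned ops' operands to
+  // the keyword-argument values created above, so the backward block is
+  // self-contained.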
VLOG(4) << "start copy backward ops"; @@ -1398,7 +1596,6 @@ SplitedResult SplitForwardBackward( auto *cloned_op = op->Clone(backward_mapper, clone_options); backward_program->block()->push_back(cloned_op); }); - // counter = 0; VLOG(4) << "start create backward outputs, inserting set_parameter ops."; if (has_backward) { std::for_each(forward_inputs_grads.begin(), @@ -1423,20 +1620,20 @@ SplitedResult SplitForwardBackward( // construct all attributes we needed. - mapping_value(middle_values, forward_value_map, fm); // write 'fm' - mapping_value(middle_values, backward_value_map, bm); // write 'bm' - mapping_value(forward_inputs, forward_value_map, fx); // write 'fx' - mapping_value(forward_inputs, backward_value_map, bx); // write 'bx' - mapping_value(forward_params, forward_value_map, fp); // write 'fp' - mapping_value(forward_params, backward_value_map, bp); // write 'bp' - mapping_value(forward_outputs, forward_value_map, fo); // write 'fo' + mapping_value(middle_values, forward_value_map, fm); // write 'fm' + mapping_value(middle_values, backward_value_map, bm); // write 'bm' + mapping_value(forward_inputs, forward_value_map, fx); // write 'fx' + mapping_value(forward_inputs, backward_value_map, bx); // write 'bx' + mapping_value(forward_params, forward_value_map, fp); // write 'fp' + mapping_value(forward_params, backward_value_map, bp); // write 'bp' + mapping_value(forward_outputs_mutable, forward_value_map, fo); // write 'fo' mapping_value( forward_inputs_grads, backward_value_map, bx_g); // write 'bx_g' mapping_value( forward_params_grads, backward_value_map, bp_g); // write 'bp_g' mapping_value( - forward_outputs_grads, backward_value_map, bo_g); // write 'bo_g' - mapping_value(forward_outputs, backward_value_map, bo); // write 'bo' + forward_outputs_grads, backward_value_map, bo_g); // write 'bo_g' + mapping_value(forward_outputs_mutable, backward_value_map, bo); // write 'bo' mapping_value(GetNoNeedBufferValue(program.block(), backward_range), forward_value_map, no_need_buffer_values); // write 'no_need_buffers' @@ -1502,39 +1699,6 @@ void ResetShadowOutputName(pir::Operation *op, const std::string &name) { } } -std::map GetOpInplaceInfo(const pir::Operation *op) { - std::map inplace_info; - if (!op->HasTrait()) { - return inplace_info; - } - pir::IrContext *ctx = pir::IrContext::Instance(); - std::string op_name = op->name(); - if (op->attributes().count("op_name")) { - op_name = - op->attributes().at("op_name").dyn_cast().AsString(); - } - - pir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_name); - paddle::dialect::OpYamlInfoParser yaml_parser( - op_info.GetInterfaceImpl() - ->get_op_info_(op_name), - paddle::dialect::IsLegacyOp(op_name)); - - for (size_t i = 0; i < op->num_results(); ++i) { - std::string value_name = yaml_parser.OutputNames()[i]; - if (yaml_parser.HasInplace(value_name)) { - const std::string &inplace_name = yaml_parser.InplaceName(value_name); - inplace_info[i] = yaml_parser.InputName2Id().at(inplace_name); - } - if (yaml_parser.HasView(value_name)) { - const std::string &view_name = yaml_parser.ViewName(value_name); - inplace_info[i] = yaml_parser.InputName2Id().at(view_name); - } - } - - return inplace_info; -} - void BindUtils(pybind11::module *m) { m->def("clone_program", CloneProgram); m->def("get_op_inplace_info", GetOpInplaceInfo); @@ -1702,33 +1866,34 @@ void BindUtils(pybind11::module *m) { >>> print(mappings) {'matmul_v2_0.tmp_0': [Value(define_op_name=pd_op.matmul, index=0, dtype=builtin.tensor<4x4xf32>)], 'x': [Value(define_op_name=pd_op.data, 
index=0, dtype=builtin.tensor<4x4xf32>)], 'tanh_0.tmp_0': [Value(define_op_name=pd_op.tanh, index=0, dtype=builtin.tensor<4x4xf32>)], 'elementwise_add_0': [Value(define_op_name=pd_op.add, index=0, dtype=builtin.tensor<4x4xf32>)]}
       )DOC");
-
-  m->def("clear_cinn_compilation_cache",
-         []() {
+  m->def("clear_cinn_compilation_cache", []() {
 #ifdef PADDLE_WITH_CINN
-           pybind11::gil_scoped_release release;
-           VLOG(4) << "clear CINN CompilationCache and free BackendResource.";
-           cinn::hlir::framework::CompilationCache::Instance().Clear();
+    pybind11::gil_scoped_release release;
+    VLOG(4) << "clear CINN CompilationCache and free BackendResource.";
+    cinn::hlir::framework::CompilationCache::Instance().Clear();
 #endif
-         }),
-  m->def("apply_mix2dist_pass", paddle::dialect::MixToDistPass);
+  });
 }
 namespace {
+#ifdef PADDLE_WITH_CINN
+std::shared_ptr<pir::PassManager> CreatePassManager() {
+  pir::IrContext *ctx = pir::IrContext::Instance();
+  ctx->GetOrRegisterDialect<cinn::dialect::OperatorDialect>();
+  ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>();
+  ctx->GetOrRegisterDialect<pir::shape::ShapeDialect>();
+  auto pass_manager = std::make_shared<pir::PassManager>(ctx);
+  if (FLAGS_print_ir) {
+    pass_manager->EnableIRPrinting();
+  }
+  return pass_manager;
+}
+#endif
+
 void ApplyCinnPass(Program &program) {  // NOLINT
 #ifdef PADDLE_WITH_CINN
-  cinn::dialect::ir::ApplyCinnPass(&program, [] {
-    pir::IrContext *ctx = pir::IrContext::Instance();
-    ctx->GetOrRegisterDialect<cinn::dialect::OperatorDialect>();
-    ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>();
-    ctx->GetOrRegisterDialect<pir::shape::ShapeDialect>();
-    auto pass_manager = std::make_shared<pir::PassManager>(ctx);
-    if (FLAGS_print_ir) {
-      pass_manager->EnableIRPrinting();
-    }
-    return pass_manager;
-  });
+  cinn::dialect::ir::ApplyCinnPass(&program, CreatePassManager);
 #else
   PADDLE_THROW(common::errors::Unimplemented(
       "Currently we only support CINN Pass for Pir under @to_static, please "
@@ -1736,6 +1901,14 @@ void ApplyCinnPass(Program &program) {  // NOLINT
 #endif
 }
+void CheckInferSymbolicIfNeed(Program &program) {  // NOLINT
+#ifdef PADDLE_WITH_CINN
+  cinn::dialect::ir::CheckInferSymbolicIfNeed(&program, CreatePassManager);
+#else
+  // Do nothing.
+#endif
+}
+
 }  // namespace
 void InferSymbolicShapePass(
@@ -1751,6 +1924,7 @@ void BindIrPass(pybind11::module *m) {
   m->def("apply_cinn_pass", ApplyCinnPass);
+  m->def("check_infer_symbolic_if_need", CheckInferSymbolicIfNeed);
   m->def("infer_symbolic_shape_pass", InferSymbolicShapePass);
   py::class_<Pass, std::shared_ptr<Pass>> pass(*m,
@@ -1781,8 +1955,26 @@ void BindPassManager(pybind11::module *m) {
           }),
           py::arg("opt_level") = 2)
       .def("add_pass",
-           [](PassManager &self, const std::string &pass_name) {
-             self.AddPass(pir::PassRegistry::Instance().Get(pass_name));
+           [](PassManager &self,
+              const std::string &pass_name,
+              const std::unordered_map<std::string, py::object> attrs = {}) {
+             auto pass = pir::PassRegistry::Instance().Get(pass_name);
+             for (const auto &attr : attrs) {
+               if (py::isinstance<py::str>(attr.second)) {
+                 pass->Set(attr.first,
+                           new std::string(attr.second.cast<std::string>()));
+               } else if (py::isinstance<py::bool_>(attr.second)) {
+                 pass->Set(attr.first, new bool(attr.second.cast<bool>()));
+               } else if (py::isinstance<py::int_>(attr.second)) {
+                 pass->Set(attr.first, new int(attr.second.cast<int>()));
+               } else if (py::isinstance<py::float_>(attr.second)) {
+                 pass->Set(attr.first, new float(attr.second.cast<float>()));
+               } else {
+                 PADDLE_THROW(phi::errors::InvalidArgument(
+                     "This type of pass attr is not supported."));
+               }
+             }
+             self.AddPass(std::move(pass));
           })
      .def("passes",
           [](PassManager &self) {
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 5470f4d7ec4f2..35d1a297720b4 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -190,6 +190,7 @@ limitations under the License. */
 #endif
 #ifdef PADDLE_WITH_CINN
+#include "paddle/cinn/pybind/bind.h"
 #include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h"
 #include "paddle/fluid/pybind/test.h"
 #endif
@@ -405,6 +406,10 @@ bool SupportsInt8() {
 #endif
 }
+bool SupportsAvx512F() {
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512f);
+}
+
 bool SupportsVNNI() {
 #ifndef PADDLE_WITH_DNNL
   return false;
@@ -2153,6 +2158,7 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("supports_bfloat16", SupportsBfloat16);
   m.def("supports_bfloat16_fast_performance", SupportsBfloat16FastPerformance);
   m.def("supports_int8", SupportsInt8);
+  m.def("supports_avx512f", SupportsAvx512F);
   m.def("supports_vnni", SupportsVNNI);
   m.def("op_supported_infos", imperative::OpSupportedInfos);
   m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
@@ -3053,6 +3059,7 @@ All parameter, weight, gradient are variables in Paddle.
 #if defined(PADDLE_WITH_CINN)
   BindTest(&m);
+  cinn::pybind::BindCINN(&m);
 #endif
   BindPir(&m);
diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc
index c66cd9d0dc81f..bf3d025b228cc 100644
--- a/paddle/fluid/pybind/tensor.cc
+++ b/paddle/fluid/pybind/tensor.cc
@@ -859,7 +859,7 @@ void BindTensor(pybind11::module &m) {  // NOLINT
            )DOC")
 #endif
       .def("_share_filename",
-           [](phi::DenseTensor &self) {
+           [](phi::DenseTensor &self, bool use_file_descriptor) {
             if (!self.IsInitialized() || self.numel() == 0)
               throw std::runtime_error(
                   "Tensor not initialized or numel is 0.
could not pass to " @@ -886,6 +886,10 @@ void BindTensor(pybind11::module &m) { // NOLINT int flags = memory::allocation::MAPPED_SHAREDMEM | memory::allocation::MAPPED_EXCLUSIVE; + if (use_file_descriptor) { + flags = flags | memory::allocation::MAPPED_KEEPFD | + memory::allocation::MAPPED_UNLINK; + } std::string handle = memory::allocation::GetIPCName(); int find_id = -1; if (FLAGS_use_shm_cache) { @@ -894,9 +898,10 @@ void BindTensor(pybind11::module &m) { // NOLINT if (find_id != -1) { handle = memory::allocation::MemoryMapAllocationPool::Instance().GetById(find_id).file_name_; // NOLINT } + int shared_fd = -1; auto shared_holder = memory::allocation::AllocateRefcountedMemoryMapAllocation( - handle, flags, data_size, find_id); + handle, shared_fd, flags, data_size, find_id); // copy data & reset holder if (platform::is_cuda_pinned_place(holder->place())) { @@ -914,8 +919,10 @@ void BindTensor(pybind11::module &m) { // NOLINT int type_idx = static_cast(self.type()); return py::make_tuple(mmap_allocation->ipc_name(), + mmap_allocation->shared_fd(), mmap_allocation->size(), type_idx, - common::vectorize(self.dims()), self.lod()); + common::vectorize(self.dims()), self.lod(), + use_file_descriptor); }, R"DOC( Serialize CPU lod tensor in shared memory to tuple. @@ -935,30 +942,37 @@ void BindTensor(pybind11::module &m) { // NOLINT )DOC") .def("_new_shared_filename", [](py::tuple t) { // __setstate__ - if (t.size() != 5) + if (t.size() != 7) throw std::runtime_error("Invalid Tensor meta info state!"); phi::DenseTensor tensor; // 2. Rebuild Allocation const std::string &ipc_name = t[0].cast(); - size_t size = t[1].cast(); + const int shared_fd = t[1].cast(); + const bool use_file_descriptor = t[6].cast(); + + size_t size = t[2].cast(); int flags = memory::allocation::MAPPED_SHAREDMEM | memory::allocation::MAPPED_NOCREATE; + if (use_file_descriptor) { + flags = flags | memory::allocation::MAPPED_KEEPFD | + memory::allocation::MAPPED_UNLINK; + } int find_id = -1; if (FLAGS_use_shm_cache) { find_id = memory::allocation::MemoryMapAllocationPool::Instance().FindFromCache(flags, size, ipc_name, /*check_refcount*/ false); // NOLINT } auto shared_holder = memory::allocation::AllocateRefcountedMemoryMapAllocation( - ipc_name, flags, size, find_id); + ipc_name, shared_fd, flags, size, find_id); // 3. Rebuild Tensor tensor.ResetHolderWithType( shared_holder, - static_cast(t[2].cast())); - tensor.Resize(common::make_ddim(t[3].cast>())); - tensor.set_lod(t[4].cast()); + static_cast(t[3].cast())); + tensor.Resize(common::make_ddim(t[4].cast>())); + tensor.set_lod(t[5].cast()); return tensor; }, diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index ba3a466fba219..c93588f73d6f3 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -31,11 +31,11 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/pybind/complex.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/funcs/strided_memcpy.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" @@ -696,7 +696,7 @@ void _sliceCompute(const phi::DenseTensor *in, auto out_t = framework::EigenTensor::From( *out); - operators::EigenSlice, T, D>::Eval( + phi::funcs::EigenSlice, T, D>::Eval( eigen_place, out_t, in_t, offsets, extents); } diff --git a/paddle/fluid/pybind/uva_utils.h b/paddle/fluid/pybind/uva_utils.h index 7f29814bcecb5..4d46a2398056d 100644 --- a/paddle/fluid/pybind/uva_utils.h +++ b/paddle/fluid/pybind/uva_utils.h @@ -20,10 +20,10 @@ #undef copysign #endif -#include "paddle/fluid/operators/utils.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/tensor_utils.h" namespace paddle { namespace pybind { diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 7325aef2202b5..ee4fcec12c257 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -62,8 +62,8 @@ if(WITH_XBYAK) list(APPEND PHI_DEPS xbyak) endif() -if(WITH_MKLDNN) - list(APPEND PHI_DEPS mkldnn) +if(WITH_ONEDNN) + list(APPEND PHI_DEPS onednn) endif() if(WITH_GLOO) @@ -119,8 +119,11 @@ if(WITH_AVX AND AVX512F_FLAG AND WITH_MKL) set_source_files_properties( + kernels/fusion/cpu/fused_layer_norm_avx_kernel.cc kernels/fusion/cpu/self_dp_attention_kernel.cc - PROPERTIES COMPILE_FLAGS "-Wno-maybe-uninitialized -mfma ${AVX512F_FLAG}") + kernels/fusion/cpu/rms_norm_avx_kernel.cc + PROPERTIES COMPILE_FLAGS + "${Wno_Maybe_Uninitialized} ${FMA_FLAG} ${AVX512F_FLAG}") endif() if(WITH_GPU) diff --git a/paddle/phi/api/CMakeLists.txt b/paddle/phi/api/CMakeLists.txt index 1827dfbeb7f64..b06c40cf41a6e 100644 --- a/paddle/phi/api/CMakeLists.txt +++ b/paddle/phi/api/CMakeLists.txt @@ -1,2 +1,9 @@ add_subdirectory(profiler) add_subdirectory(lib) +if(WIN32) + file(GLOB YAML_FILE "${CMAKE_CURRENT_SOURCE_DIR}/yaml/*.yaml") + set_property( + DIRECTORY + APPEND + PROPERTY CMAKE_CONFIGURE_DEPENDS ${YAML_FILE}) +endif() diff --git a/paddle/phi/api/lib/backend_set.h b/paddle/phi/api/lib/backend_set.h index af4de2580f578..13077fb9167ad 100644 --- a/paddle/phi/api/lib/backend_set.h +++ b/paddle/phi/api/lib/backend_set.h @@ -26,7 +26,7 @@ namespace experimental { * and the higher backend bit has a higher priority. * * A Tensor may belong to multiple backends at the same time, such CPU and - * MKLDNN. Only one backend value cannot + * OneDNN. 
Only one backend value cannot */ class BackendSet final { public: diff --git a/paddle/phi/api/profiler/device_tracer.cc b/paddle/phi/api/profiler/device_tracer.cc index e1c009fa9cad0..085d28220a6a9 100644 --- a/paddle/phi/api/profiler/device_tracer.cc +++ b/paddle/phi/api/profiler/device_tracer.cc @@ -834,7 +834,7 @@ uint32_t GetCurSystemThreadId() { return id; } -void RecoreCurThreadId(uint64_t id) { +void RecordCurThreadId(uint64_t id) { std::lock_guard lock(system_thread_id_map_mutex); auto gid = GetCurSystemThreadId(); system_thread_id_map[gid] = id; diff --git a/paddle/phi/api/profiler/device_tracer.h b/paddle/phi/api/profiler/device_tracer.h index bde73357f2075..a0f4b5c54670e 100644 --- a/paddle/phi/api/profiler/device_tracer.h +++ b/paddle/phi/api/profiler/device_tracer.h @@ -162,5 +162,5 @@ void ClearCurBlock(); int BlockDepth(); // Set current thread id, so we can map the system thread id to thread id. -void RecoreCurThreadId(uint64_t id); +void RecordCurThreadId(uint64_t id); } // namespace phi diff --git a/paddle/phi/api/profiler/profiler_helper.h b/paddle/phi/api/profiler/profiler_helper.h index 31ccbbb12fb6f..16ae735fccc1e 100644 --- a/paddle/phi/api/profiler/profiler_helper.h +++ b/paddle/phi/api/profiler/profiler_helper.h @@ -73,7 +73,7 @@ inline EventList &GetEventList() { ProfilerHelper::g_thread_id = ProfilerHelper::g_next_thread_id++; ProfilerHelper::g_all_event_lists.emplace_front( ProfilerHelper::g_event_list); - RecoreCurThreadId(ProfilerHelper::g_thread_id); + RecordCurThreadId(ProfilerHelper::g_thread_id); } return *ProfilerHelper::g_event_list; } diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 25bd37ab01f87..3937464fbce49 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -13,6 +13,7 @@ kernel : func : abs_double_grad data_type : grad_x_grad + backward : abs_triple_grad - backward_op : abs_grad forward : abs (Tensor x) -> Tensor(out) @@ -27,6 +28,17 @@ composite : abs_grad(x, out_grad, x_grad) backward : abs_double_grad +- backward_op : abs_triple_grad + forward : abs_double_grad (Tensor x, Tensor grad_x_grad) -> Tensor(grad_out_grad) + args : (Tensor x, Tensor grad_out_grad_grad) + output : Tensor(grad_x_grad_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + data_transform : + support_trans_dtype : x + composite : abs_triple_grad(x, grad_out_grad_grad, grad_x_grad_grad) + - backward_op : acos_grad forward : acos (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) @@ -717,6 +729,16 @@ kernel : func : erfinv_grad +- backward_op : exp_double_grad + forward : exp_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) + args : (Tensor out, Tensor grad_out, Tensor grad_x_grad) + output : Tensor(out_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [out, out] + composite : exp_double_grad(out, grad_out, grad_x_grad, out_grad, grad_out_grad) + inplace : (grad_x_grad -> grad_out_grad) + - backward_op : exp_grad forward : exp (Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) @@ -728,6 +750,7 @@ kernel : func : exp_grad inplace : (out_grad -> x_grad) + backward : exp_double_grad composite : exp_grad(out, out_grad, x_grad) - backward_op : expand_as_grad @@ -1434,6 +1457,7 @@ param : [x, x] kernel : func : log_double_grad + composite : log_double_grad(x, grad_out, grad_x_grad, x_grad, grad_out_grad) inplace : (grad_x_grad -> grad_out_grad) - backward_op : log_grad @@ -2010,6 +2034,7 @@ spmd_rule : 
ElementwiseUnaryGradInferSpmd kernel : func : rsqrt_grad + composite : rsqrt_grad(out, out_grad, x_grad) backward : rsqrt_double_grad inplace : (out_grad -> x_grad) @@ -2375,6 +2400,12 @@ inplace : (out_grad -> x_grad) backward: squeeze_double_grad +- backward_op : stack_double_grad + forward : stack_grad (Tensor[] x, Tensor grad_out, int axis=0) -> Tensor[](grad_x) + args : (Tensor[] grad_x_grad, int axis = 0) + output : Tensor(grad_out_grad) + invoke : stack(grad_x_grad, axis) + - backward_op : stack_grad forward : stack (Tensor[] x, int axis) -> Tensor(out) args : (Tensor[] x, Tensor out_grad, int axis) @@ -2389,6 +2420,7 @@ data_type : out_grad no_need_buffer : x composite : stack_grad(x, out_grad, axis, x_grad) + backward: stack_double_grad - backward_op : stanh_grad forward : stanh(Tensor x, float scale_a, float scale_b) -> Tensor(out) @@ -2727,6 +2759,9 @@ forward: silu_grad (Tensor x, Tensor out, Tensor grad_out) -> Tensor(grad_x) args: (Tensor x, Tensor out, Tensor grad_out, Tensor grad_x_grad) output: Tensor(x_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, x] composite: silu_double_grad(x, out, grad_out, grad_x_grad, x_grad, grad_out_grad) - backward_op: unpool3d_grad diff --git a/paddle/phi/api/yaml/fused_backward.yaml b/paddle/phi/api/yaml/fused_backward.yaml index 36c3c0dde5191..235864c4c9d8b 100644 --- a/paddle/phi/api/yaml/fused_backward.yaml +++ b/paddle/phi/api/yaml/fused_backward.yaml @@ -41,8 +41,8 @@ support_dygraph_mode : true - backward_op : fused_rotary_position_embedding_grad - forward: fused_rotary_position_embedding (Tensor q, Tensor k, Tensor v, Tensor sin, Tensor cos, Tensor position_ids, bool use_neox_rotary_style, bool time_major) -> Tensor(out_q), Tensor(out_k), Tensor(out_v) - args : (Tensor sin, Tensor cos, Tensor position_ids, Tensor out_q_grad, Tensor out_k_grad,Tensor out_v_grad, bool use_neox_rotary_style, bool time_major) + forward: fused_rotary_position_embedding (Tensor q, Tensor k, Tensor v, Tensor sin, Tensor cos, Tensor position_ids, bool use_neox_rotary_style, bool time_major, float rotary_emb_base) -> Tensor(out_q), Tensor(out_k), Tensor(out_v) + args : (Tensor sin, Tensor cos, Tensor position_ids, Tensor out_q_grad, Tensor out_k_grad,Tensor out_v_grad, bool use_neox_rotary_style, bool time_major, float rotary_emb_base) output : Tensor(q_grad), Tensor(k_grad), Tensor(v_grad) optional : sin, cos, position_ids, out_k_grad, out_v_grad, k_grad, v_grad infer_meta : diff --git a/paddle/phi/api/yaml/fused_ops.yaml b/paddle/phi/api/yaml/fused_ops.yaml index ff6969194f6d6..9b03721fb284b 100644 --- a/paddle/phi/api/yaml/fused_ops.yaml +++ b/paddle/phi/api/yaml/fused_ops.yaml @@ -1,4 +1,4 @@ -# This file is designed for fusion C++ farward operators, which manages the +# This file is designed for fusion C++ forward operators, which manages the # generated code for static mode and dynamic mode (when `support_dygraph_mode` is true). 
# "support_dygraph_mode" is an extra configuration item in this file, # if one operator have "support_dygraph_mode : true", it supports dygraph mode, @@ -83,6 +83,15 @@ data_type : x optional : bias, branch, branch_max ,x_max, scale_max, out_max_in +- op : cross_attention_xpu + args : (Tensor input_q, Tensor input_kv, Tensor[] fc_weight, Tensor[] fc_weight_max, Tensor[] fc_bias, Tensor mask, int head_num, int head_dim, float alpha, DataType out_dtype) + output : Tensor(qkv), Tensor(qkv_max) + infer_meta : + func : CrossAttentionXPUInferMeta + kernel : + func : cross_attention_xpu + data_type : input_q + - op : dequantize_xpu args : (Tensor x, DataType out_dtype, float scale = 1.0f) output : Tensor(y) @@ -186,6 +195,7 @@ func : fused_conv2d_add_act data_type : input optional : bias, residual_data, outputs + interfaces : paddle::dialect::LayoutTransformationInterface - op : fused_dconv_drelu_dbn args : (Tensor grad_output, Tensor weight, Tensor grad_output_add, Tensor residual_input, Tensor bn1_eqscale, Tensor bn1_eqbias, Tensor conv_input, Tensor bn1_mean, Tensor bn1_inv_std, Tensor bn1_gamma, Tensor bn1_beta, Tensor bn1_input, Tensor bn2_mean, Tensor bn2_inv_std, Tensor bn2_gamma, Tensor bn2_beta, Tensor bn2_input, int[] paddings, int[] dilations, int[] strides, str padding_algorithm, int groups, str data_format, bool fuse_shortcut, bool fuse_dual, bool fuse_add, bool exhaustive_search) @@ -273,7 +283,7 @@ optional : cache_kv, pre_caches, rotary_pos_emb, time_step, seq_lengths, src_mask, gather_index - op : fused_rotary_position_embedding - args : (Tensor q, Tensor k, Tensor v, Tensor sin, Tensor cos, Tensor position_ids, bool use_neox_rotary_style = true, bool time_major = false) + args : (Tensor q, Tensor k, Tensor v, Tensor sin, Tensor cos, Tensor position_ids, bool use_neox_rotary_style = true, bool time_major = false, float rotary_emb_base = 10000.0) output : Tensor(out_q), Tensor(out_k), Tensor(out_v) infer_meta : func : FusedRopeInferMeta @@ -375,6 +385,15 @@ func : generate_sequence_xpu data_type : dtype +- op : group_norm_silu_xpu + args : (Tensor x, Tensor scale, Tensor bias, int groups = -1, float epsilon = 1e-5) + output : Tensor(out) + infer_meta : + func : GroupNormalizeSiluXPUInferMeta + kernel : + func : group_norm_silu_xpu + data_type : x + - op : layer_norm_act_xpu args : (Tensor x, Tensor scale, Tensor bias, int begin_norm_axis, float epsilon, int act_type, float act_param) output : Tensor(out) @@ -420,14 +439,14 @@ optional : bias_qk - op : qkv_attention_xpu - args : (Tensor q, Tensor k, Tensor v, Tensor q_max, Tensor k_max, Tensor v_max, float alpha, int head_num, int head_dim, bool qkv_fc_fusion, DataType out_dtype) - output : Tensor(qkv), Tensor(qkv_max) + args : (Tensor q, Tensor k, Tensor v, Tensor q_max, Tensor k_max, Tensor v_max, Tensor qk_max, Tensor qkv_max, float alpha, int head_num, int head_dim, bool qkv_fc_fusion, DataType out_dtype) + output : Tensor(qkv) infer_meta : func : QKVAttentionXPUInferMeta kernel : func : qkv_attention_xpu data_type : q - optional : q_max, k_max, v_max + optional : q_max, k_max, v_max, qk_max, qkv_max - op : quantize_xpu args : (Tensor x, DataType out_dtype, float scale = 1.0f) @@ -474,6 +493,15 @@ func : skip_layernorm data_type : x +- op : spatial_transformer_resblock_xpu + args : (Tensor x, Tensor[] x_max, Tensor[] conv_bias, Tensor[] conv_filter, Tensor[] conv_filter_max, Tensor[] gn_bias, Tensor[] gn_scale, int[] dilations, int[] paddings, int[] strides, float[] gn_eps, int[] gn_groups, int[] groups, bool conv_fix, bool 
has_silu_fc_input, bool include_silu) + output : Tensor(out), Tensor(out_max) + infer_meta : + func : SpatialTransformerResblockXPUInferMeta + kernel : + func : spatial_transformer_resblock_xpu + data_type : x + - op : squeeze_excitation_block args : (Tensor x, Tensor filter, Tensor filter_max, Tensor bias, Tensor branch, int[] act_type, float[] act_param, int[] filter_dims) output : Tensor(out) diff --git a/paddle/phi/api/yaml/generator/api_gen.py b/paddle/phi/api/yaml/generator/api_gen.py index 3e144fa27d986..59eedd4a83de4 100644 --- a/paddle/phi/api/yaml/generator/api_gen.py +++ b/paddle/phi/api/yaml/generator/api_gen.py @@ -340,9 +340,7 @@ def gene_output( ) else: raise ValueError( - "{} : Output error: only support Tensor type when use view in yaml. But get {}".format( - self.api, out_dtype_list[i] - ) + f"{self.api} : Output error: only support Tensor type when use view in yaml. But get {out_dtype_list[i]}" ) else: raise ValueError( diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index e2f4cca95c923..b24b3a20c37eb 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -175,15 +175,15 @@ - backward_op : divide_double_grad forward : divide_grad (Tensor x, Tensor y, Tensor out, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) - args : (Tensor y, Tensor out, Tensor grad_x, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) + args : (Tensor y, Tensor out, Tensor grad_out, Tensor grad_x, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) output : Tensor(y_grad), Tensor(out_grad), Tensor(grad_out_grad) infer_meta : func : GeneralTernaryGradInferMeta - param : [y, grad_x, grad_x] + param : [y, out, out] kernel : func : divide_double_grad data_type : out - optional : grad_x_grad, grad_y_grad + optional : grad_x, grad_x_grad, grad_y_grad inplace : (grad_x_grad -> grad_out_grad) - backward_op : divide_grad @@ -411,6 +411,7 @@ param: [x] kernel : func : min_grad + composite : min_grad(x, out, out_grad, axis, keepdim, reduce_all, x_grad) - backward_op : minimum_grad forward : minimum(Tensor x, Tensor y) -> Tensor(out) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 142814e1cc01e..188367817803a 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -77,7 +77,6 @@ backend : place data_transform : support_trans_dtype : start, end, step - interfaces : paddle::dialect::InferSymbolicShapeInterface - op : assign args : (Tensor x) @@ -551,7 +550,7 @@ skip_transform : x - op : full_with_tensor - args : (Tensor shape, Tensor value, DataType dtype=DataType::FLOAT32) + args : (Tensor value, IntArray shape, DataType dtype=DataType::FLOAT32) output: Tensor(out) infer_meta : func : FullWithTensorInferMeta @@ -1099,7 +1098,6 @@ kernel : func : split backward : split_grad - interfaces : paddle::dialect::InferSymbolicShapeInterface - op : split_with_num args : (Tensor x, int num, Scalar(int) axis) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index ab6161e0b0765..56dad40de1353 100755 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -15,7 +15,7 @@ # attrs : [bool is_test = false] - op : abs - backward : abs_grad + backward : abs_grad, abs_double_grad, abs_triple_grad inputs : x : X outputs : @@ -296,6 +296,12 @@ get_expected_kernel_type : assign : GetAssignExpectedKernelType +- op : assign_pos + inputs : + {x : X} + outputs : + out : Out 
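+# Convention in this file: the left-hand name is the new PHI-style argument
+# and the right-hand name is the legacy Fluid one, so `{x : X}` above maps
+# the PHI input `x` onto the legacy operator input `X`.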
+ - op : assign_value outputs : out : Out @@ -818,6 +824,12 @@ outputs : out : Out +- op : dgc_momentum + inputs : + {param : Param, grad : Grad, velocity : Velocity, learning_rate : LearningRate, master_param : MasterParam, current_step_tensor : current_step, nranks_tensor : nranks} + outputs : + {param_out : ParamOut, velocity_out : VelocityOut, master_param_out : MasterParamOut, grad_out : Grad_out} + - op : diag (diag_v2) backward : diag_grad (diag_v2_grad) inputs : @@ -1019,7 +1031,7 @@ out : Out - op : exp - backward : exp_grad + backward : exp_grad, exp_double_grad inputs : x : X outputs : @@ -1267,6 +1279,12 @@ data_type : float support_tensor : true +- op : full_with_tensor + int_array: + shape : + data_type : int64_t + support_tensor : true + - op : fused_adam_(fused_adam) inputs : {params : Params, grads : Grads, learning_rate : LearningRate, moments1 : Moments1, @@ -3225,7 +3243,7 @@ outputs : [xshape] - op : stack - backward : stack_grad + backward : stack_grad, stack_double_grad inputs : x : X outputs : @@ -3705,6 +3723,12 @@ outputs: {param_out : ParamOut, moment_out : MomentOut} +- op: dgc + inputs: + {u: U, v: V, grad: Grad} + outputs: + {u_out: U_out, v_out: V_out, encode_grad: EncodeGrad, grad_out: Grad_out, gather_buff: GatherBuff} + - op: distribute_fpn_proposals inputs : {fpn_rois: FpnRois, rois_num: RoisNum} @@ -3713,6 +3737,12 @@ multi_level_rois_num: MultiLevelRoIsNum restore_index: RestoreIndex +- op: distributed_fused_lamb + inputs: + {param: Param, grad: Grad, fp32_fused_param: FP32FusedParam, fp32_fused_grad: FP32FusedGrad, fp16_fused_param: FP16FusedParam, fp16_fused_grad: FP16FusedGrad, moment1: Moment1, moment2: Moment2, beta1pow: Beta1Pow, beta2pow: Beta2Pow, fused_param_offsets: FusedParamOffsets, fp32_shard_fused_param_offsets: FP32ShardFusedParamOffsets, fp16_shard_fused_param_offsets: FP16ShardFusedParamOffsets, param_info: ParamInfo, param_order: ParamOrder, learning_rate: LearningRate, global_scale: GlobalScale} + outputs: + {param_out : ParamOut, fp32_fused_param_out: FP32FusedParamOut, fp16_fused_param_out: FP16FusedParamOut, fp32_acc_fused_grad: FP32AccFusedGrad, fp16_acc_fused_grad: FP16AccFusedGrad, moment1_out: Moment1Out, moment2_out: Moment2Out, beta1pow_out: Beta1PowOut, beta2pow_out: Beta2PowOut, found_inf: FoundInf, acc_step: AccStep, stop_update: StopUpdate, step: Step} + - op: distributed_fused_lamb_init inputs: {param: Param, grad: Grad} diff --git a/paddle/phi/api/yaml/op_version.yaml b/paddle/phi/api/yaml/op_version.yaml index 2bd09abd311ae..6e7a2cff79764 100644 --- a/paddle/phi/api/yaml/op_version.yaml +++ b/paddle/phi/api/yaml/op_version.yaml @@ -274,7 +274,7 @@ - op : generate_proposals version : - - checkpoint : Registe generate_proposals_v2 for adding the attribute of pixel_offset + - checkpoint : Register generate_proposals_v2 for adding the attribute of pixel_offset action : - add_attr : pixel_offset comment : If true, im_shape pixel offset is 1. diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index d6f4c6cddfb27..9830d7ae3a7a4 100755 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -1,7 +1,7 @@ # This file is designed for C++ operators, which manages the # generated code for dynamic mode and static mode. If you want # to add the new operator configuration, make sure an operator's -# Python API, dynamic graph API, and static graph Opertaor parameters +# Python API, dynamic graph API, and static graph Operator parameters # are consistent and correspond one-to-one. 
It's forbidden that the # operator configured in this yaml file does not have Python API. @@ -327,6 +327,7 @@ backward : bicubic_interp_grad data_transform : skip_transform : out_size, size_tensor, scale_tensor + interfaces : paddle::dialect::InferSymbolicShapeInterface - op : bilinear args : (Tensor x, Tensor y, Tensor weight, Tensor bias) @@ -350,6 +351,7 @@ backward : bilinear_interp_grad data_transform : skip_transform : out_size, size_tensor, scale_tensor + interfaces : paddle::dialect::InferSymbolicShapeInterface - op : bincount args: (Tensor x, Tensor weights, Scalar(int) minlength = 0) @@ -1658,6 +1660,7 @@ backward : linear_interp_grad data_transform : skip_transform : out_size, size_tensor, scale_tensor + interfaces : paddle::dialect::InferSymbolicShapeInterface - op : llm_int8_linear args : (Tensor x, Tensor weight, Tensor bias, Tensor weight_scale, float threshold=6.0) @@ -1976,6 +1979,7 @@ func : meshgrid data_type : inputs backward : meshgrid_grad + interfaces : paddle::dialect::InferSymbolicShapeInterface - op : mode args : (Tensor x, int axis = -1, bool keepdim = false) @@ -2068,6 +2072,7 @@ backward : nearest_interp_grad data_transform : skip_transform : out_size, size_tensor, scale_tensor + interfaces : paddle::dialect::InferSymbolicShapeInterface - op : nextafter args : (Tensor x, Tensor y) @@ -2314,6 +2319,7 @@ kernel : func : relu6 backward : relu6_grad + interfaces : paddle::dialect::InferSymbolicShapeInterface - op : renorm args : (Tensor x, float p, int axis, float max_norm) @@ -2555,7 +2561,7 @@ kernel : func : shape {dense -> dense}, shape_sr {selected_rows -> dense} - data_transform: + data_transform : skip_transform : input interfaces : paddle::dialect::InferSymbolicShapeInterface @@ -2616,7 +2622,7 @@ spmd_rule : ElementwiseUnaryInferSpmd kernel : func : sin - inplace: (x -> out) + inplace : (x -> out) backward : sin_grad interfaces : paddle::dialect::InferSymbolicShapeInterface @@ -2777,10 +2783,10 @@ - op : swiglu args : (Tensor x, Tensor y) output : Tensor(out) - infer_meta: - func: SwiGLUInferMeta - spmd_rule: SwiGLUInferSpmd - kernel: + infer_meta : + func : SwiGLUInferMeta + spmd_rule : SwiGLUInferSpmd + kernel : func : swiglu optional : y backward: swiglu_grad @@ -2804,7 +2810,7 @@ func : UnchangedInferMeta kernel : func : tan - inplace: (x -> out) + inplace : (x -> out) backward : tan_grad interfaces : paddle::dialect::InferSymbolicShapeInterface @@ -2913,6 +2919,7 @@ backward : trilinear_interp_grad data_transform : skip_transform : out_size, size_tensor, scale_tensor + interfaces : paddle::dialect::InferSymbolicShapeInterface - op : trunc args : (Tensor input) @@ -3052,9 +3059,9 @@ func : WarpctcInferMeta kernel : func : warpctc - data_type: logits - optional: logits_length, labels_length - intermediate: warpctcgrad + data_type : logits + optional : logits_length, labels_length + intermediate : warpctcgrad backward : warpctc_grad - op : warprnnt @@ -3064,8 +3071,8 @@ func : WarprnntInferMeta kernel : func : warprnnt - data_type: input - intermediate: warprnntgrad + data_type : input + intermediate : warprnntgrad backward : warprnnt_grad - op : weight_dequantize @@ -3085,8 +3092,8 @@ kernel : func : weight_only_linear data_type : x - optional: bias - backward: weight_only_linear_grad + optional : bias + backward : weight_only_linear_grad - op : weight_quantize args : (Tensor x, str algo = "weight_only_int8", int arch = 80, int group_size = -1) @@ -3095,7 +3102,8 @@ func : WeightQuantizeInferMeta kernel : func : weight_quantize - data_type: x + 
data_type : x + backend : x - op : weighted_sample_neighbors args : (Tensor row, Tensor colptr, Tensor edge_weight, Tensor input_nodes, Tensor eids, int sample_size, bool return_eids) @@ -3114,7 +3122,7 @@ spmd_rule: WhereInferSpmd kernel : func : where - inplace: (x -> out) + inplace : (x -> out) backward : where_grad interfaces : paddle::dialect::InferSymbolicShapeInterface diff --git a/paddle/phi/api/yaml/static_backward.yaml b/paddle/phi/api/yaml/static_backward.yaml index 526a7195a5bb3..d4ca3f05e7c0b 100755 --- a/paddle/phi/api/yaml/static_backward.yaml +++ b/paddle/phi/api/yaml/static_backward.yaml @@ -103,7 +103,7 @@ output : Tensor(weight_grad) infer_meta : func : EmbeddingGradInferMeta - param : [x,weght] + param : [x,weight] kernel : func : embedding_grad {dense, dense, dense -> dense} embedding_sparse_grad {dense, dense, dense -> selected_rows} diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index 80d5f14e627a3..67690440f6bbb 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -31,7 +31,7 @@ if(WITH_XPU) list(APPEND BACKENDS_DEPS phi_dynload_xpti) endif() -if(WITH_MKLDNN) +if(WITH_ONEDNN) list(APPEND BACKENDS_SRCS onednn/onednn_context.cc) list(APPEND BACKENDS_SRCS onednn/axpy_handler.cc) list(APPEND BACKENDS_SRCS onednn/matmul_utils.cc) diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index 0b056d6df972f..3e65845905646 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -103,13 +103,14 @@ static constexpr char* win_nvjpeg_lib = ".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll;nvjpeg64_10.dll"; static constexpr char* win_cusolver_lib = "cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;cusolver64_" CUDA_VERSION_MAJOR ".dll;cusolver64_10.dll"; + ".dll;cusolver64_" CUDA_VERSION_MAJOR + ".dll;cusolver64_11.dll;cusolver64_10.dll"; static constexpr char* win_cusparse_lib = "cusparse64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR ".dll;cusparse64_" CUDA_VERSION_MAJOR ".dll;cusparse64_10.dll"; static constexpr char* win_cufft_lib = "cufft64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;cufft64_" CUDA_VERSION_MAJOR ".dll;cufft64_10.dll"; + ".dll;cufft64_" CUDA_VERSION_MAJOR ".dll;cufft64_11.dll;cufft64_10.dll"; #else static constexpr char* win_curand_lib = "curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR diff --git a/paddle/phi/backends/onednn/onednn_context.cc b/paddle/phi/backends/onednn/onednn_context.cc index b7789f29740f0..1a27e83af50fb 100644 --- a/paddle/phi/backends/onednn/onednn_context.cc +++ b/paddle/phi/backends/onednn/onednn_context.cc @@ -189,7 +189,7 @@ struct OneDNNContext::Impl { std::lock_guard lock(*p_mutex_); - // Find ShapeBlob for current mkldnn session id. + // Find ShapeBlob for current onednn session id. 
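// (The lookup below assumes the two-level cache layout documented in
// onednn_context.h: pMap keys each session/thread id to a ShapeBlob, and that
// ShapeBlob keys the current input-shape string to the cached primitives, so
// a miss at this first level simply means nothing is cached for this sid yet.)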
auto map_it = pMap->find(sid); if (map_it == pMap->end()) { @@ -259,7 +259,7 @@ struct OneDNNContext::Impl { std::lock_guard lock(*p_mutex_); - // Find ShapeBlob for current mkldnn session id firstly + // Find ShapeBlob for current onednn session id firstly auto map_it = pMap->find(sid); // (jczaja): After first iteration of model's execution we // should have all elements cached (mostly) so failures are unlikely (less @@ -366,7 +366,7 @@ struct OneDNNContext::Impl { unsigned int block_next_cache_clearing_ = 0; // Holds some attributes only used by the onednn kernel calculation - // Since original mkldnn op kernel directly adds the operations that require + // Since original onednn op kernel directly adds the operations that require // fusion to the native kernel operations, and uses the attribute `fuse_xxx` // to control, for onednn, there will be some attributes that seem to be // independent of the device are also saved here. diff --git a/paddle/phi/backends/onednn/onednn_context.h b/paddle/phi/backends/onednn/onednn_context.h index 499be34650098..0e4654cb50a77 100644 --- a/paddle/phi/backends/onednn/onednn_context.h +++ b/paddle/phi/backends/onednn/onednn_context.h @@ -28,7 +28,7 @@ namespace phi { using TensorNameMap = std::map>; class OneDNNContextThreadLocals { - // default mkldnn session id + // default onednn session id typedef OneDNNContextThreadLocals self; struct Body { @@ -38,7 +38,7 @@ class OneDNNContextThreadLocals { // - For fixed-shape, it's a null string in default. // - For dynamic-shape, it's user specific. std::string cur_input_shape_str; - // the cache capacity of different input shapes for MKLDNN. + // the cache capacity of different input shapes for OneDNN. // Default 1 means fixed input shape, not dynamic shape. int cur_input_shape_cache_capacity; // Recently registered data_format. This is needed to @@ -73,9 +73,9 @@ class OneDNNContextThreadLocals { OneDNNContextThreadLocals(const OneDNNContextThreadLocals& c) = delete; public: - // default mkldnn session id + // default onednn session id static constexpr size_t kMKLDNNSessionID_Default = 0; - // mkldnn session id for cache clearing mode + // onednn session id for cache clearing mode static constexpr size_t kMKLDNNSessionID_CacheClearing = -1; TEST_API static Body& fetch(); }; @@ -89,7 +89,7 @@ class OneDNNContext : public CPUContext { template using umap_key_string_t = umap_value_smart_t; - // Following three maps are used to cache MKLDNN primitives. + // Following three maps are used to cache OneDNN primitives. 
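// For orientation, kernels typically round-trip this cache through the
// context's SetBlob/GetBlob accessors; the blob name in this sketch is
// hypothetical, not an identifier used anywhere in this patch:
//   dev_ctx.SetBlob("conv_fwd_pd@cur_shape", pd_ptr);       // store per shape
//   auto cached = dev_ctx.GetBlob("conv_fwd_pd@cur_shape"); // same thread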
// Their relations are: // - BlobMap = Map<cur_thread_id, ShapeBlob> // - ShapeBlob = Map<cur_input_shape_str, KeyBlob> diff --git a/paddle/phi/backends/xpu/xpu1_op_list.cc b/paddle/phi/backends/xpu/xpu1_op_list.cc index cef49d14c076f..58e5c5d72beab 100644 --- a/paddle/phi/backends/xpu/xpu1_op_list.cc +++ b/paddle/phi/backends/xpu/xpu1_op_list.cc @@ -154,13 +154,13 @@ XPUOpMap& get_kl1_ops() { XPUKernelSet({phi::DataType::INT64, phi::DataType::INT32, phi::DataType::FLOAT32})}, + {"group_norm_silu_xpu", XPUKernelSet({phi::DataType::FLOAT32})}, {"hard_switch_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"hard_switch", XPUKernelSet({phi::DataType::FLOAT32})}, {"index_select", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT32, phi::DataType::INT64})}, - {"iou_similarity", XPUKernelSet({phi::DataType::FLOAT32})}, {"lamb", XPUKernelSet({phi::DataType::FLOAT32})}, {"layer_norm_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"layer_norm", XPUKernelSet({phi::DataType::FLOAT32})}, diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index 07972469a32b1..9698544b3738f 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -549,6 +549,8 @@ XPUOpMap& get_kl2_ops() { phi::DataType::FLOAT32})}, {"grid_sampler_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"grid_sampler", XPUKernelSet({phi::DataType::FLOAT32})}, + {"group_norm_silu_xpu", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"hard_sigmoid_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"hard_sigmoid", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, @@ -586,7 +588,6 @@ XPUOpMap& get_kl2_ops() { {"instance_norm_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"inverse", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT64})}, - {"iou_similarity", XPUKernelSet({phi::DataType::FLOAT32})}, {"label_smooth", XPUKernelSet({phi::DataType::FLOAT32})}, {"lamb", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"lars_momentum", @@ -836,8 +837,6 @@ XPUOpMap& get_kl2_ops() { phi::DataType::FLOAT16, phi::DataType::INT32, phi::DataType::INT64})}, - {"sampling_id", - XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT64})}, {"set_value", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT32, @@ -914,6 +913,8 @@ XPUOpMap& get_kl2_ops() { phi::DataType::INT16, phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"spatial_transformer_resblock_xpu", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"split", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16, @@ -1210,7 +1211,9 @@ XPUOpMap& get_kl2_ops() { {"fused_feedforward_grad", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"qkv_attention_xpu", - XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT8})}, {"lod_reset", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16, @@ -1225,6 +1228,8 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::FLOAT32})}, {"roformer_relative_embedding_xpu", XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::FLOAT32})}, + {"cross_attention_xpu", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"variable_length_memory_efficient_attention", XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::FLOAT32})}, {"flash_attn_unpadded", diff --git a/paddle/phi/backends/xpu/xpu3_op_list.cc b/paddle/phi/backends/xpu/xpu3_op_list.cc index 48dc5d8334193..779f35a483bc7 100644
--- a/paddle/phi/backends/xpu/xpu3_op_list.cc +++ b/paddle/phi/backends/xpu/xpu3_op_list.cc @@ -125,6 +125,7 @@ XPUOpMap& get_kl3_ops() { phi::DataType::INT64})}, {"c_concat", XPUKernelSet({phi::DataType::FLOAT16, + phi::DataType::BFLOAT16, phi::DataType::FLOAT32, phi::DataType::INT32, phi::DataType::INT64})}, @@ -523,6 +524,8 @@ XPUOpMap& get_kl3_ops() { phi::DataType::FLOAT16, phi::DataType::FLOAT32})}, {"grid_sampler_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"group_norm_silu_xpu", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"hard_sigmoid_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"hard_sigmoid", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, @@ -807,8 +810,6 @@ XPUOpMap& get_kl3_ops() { phi::DataType::FLOAT16, phi::DataType::INT32, phi::DataType::INT64})}, - {"sampling_id", - XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT64})}, {"set_value", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT32, diff --git a/paddle/phi/common/data_type.h b/paddle/phi/common/data_type.h index f28dd7e1c6ef1..e7a07b7a3e525 100644 --- a/paddle/phi/common/data_type.h +++ b/paddle/phi/common/data_type.h @@ -253,6 +253,46 @@ inline std::string DataTypeToString(const DataType& dtype) { } } +inline DataType StringToDataType(const std::string& dtype) { + if (dtype == "Undefined(ALL_DTYPE)") { + return DataType::UNDEFINED; + } else if (dtype == "bool") { + return DataType::BOOL; + } else if (dtype == "int8") { + return DataType::INT8; + } else if (dtype == "uint8") { + return DataType::UINT8; + } else if (dtype == "int16") { + return DataType::INT16; + } else if (dtype == "uint16") { + return DataType::UINT16; + } else if (dtype == "int32") { + return DataType::INT32; + } else if (dtype == "uint32") { + return DataType::UINT32; + } else if (dtype == "int64") { + return DataType::INT64; + } else if (dtype == "uint64") { + return DataType::UINT64; + } else if (dtype == "bfloat16") { + return DataType::BFLOAT16; + } else if (dtype == "float16") { + return DataType::FLOAT16; + } else if (dtype == "float32") { + return DataType::FLOAT32; + } else if (dtype == "float64") { + return DataType::FLOAT64; + } else if (dtype == "complex64") { + return DataType::COMPLEX64; + } else if (dtype == "complex128") { + return DataType::COMPLEX128; + } else if (dtype == "pstring") { + return DataType::PSTRING; + } else { + PD_THROW("Invalid enum data type `", dtype, "`."); + } +} + } // namespace phi namespace paddle { diff --git a/paddle/phi/config.h.in b/paddle/phi/config.h.in index cb3d7eadc7f04..38cac639437b7 100644 --- a/paddle/phi/config.h.in +++ b/paddle/phi/config.h.in @@ -12,8 +12,8 @@ #define ON 1 #define OFF 0 -// WITH_MKLDNN -#if @WITH_MKLDNN@ +// WITH_ONEDNN +#if @WITH_ONEDNN@ #undef PADDLE_WITH_DNNL #define PADDLE_WITH_DNNL #endif diff --git a/paddle/phi/core/dense_tensor.h b/paddle/phi/core/dense_tensor.h index b78cec1483272..c2d804199d2c7 100644 --- a/paddle/phi/core/dense_tensor.h +++ b/paddle/phi/core/dense_tensor.h @@ -203,7 +203,7 @@ class TEST_API DenseTensor : public TensorBase, * * 1. 
Some hardware or third-party libraries add some additional storage * properties on top of the description of the basic DenseTensor, such as - * memory desc of MKLDNN, storage_format and storage_layout of NPU, + * memory desc of OneDNN, storage_format and storage_layout of NPU, * these members are necessary for optimal performance, but if the properties * of each device are added to the DenseTensor with different macro isolation, * the memory layout of the DenseTensor will become more fragmented. diff --git a/paddle/phi/core/tensor_meta.h b/paddle/phi/core/tensor_meta.h index f493e0249d7bf..613ba5f1f7f1f 100644 --- a/paddle/phi/core/tensor_meta.h +++ b/paddle/phi/core/tensor_meta.h @@ -75,7 +75,7 @@ struct TEST_API DenseTensorMeta { bool is_scalar{false}; /// \brief Determine whether using gpudnn speed-up library in the new dygraph. - /// It maybe also support MKLDNN library in the near future. + /// It may also support the OneDNN library in the near future. bool use_gpudnn{true}; DDim dims; DataType dtype{DataType::UNDEFINED}; diff --git a/paddle/phi/core/visit_type.h b/paddle/phi/core/visit_type.h index ad30da4ddcd6f..03da054450092 100644 --- a/paddle/phi/core/visit_type.h +++ b/paddle/phi/core/visit_type.h @@ -355,7 +355,7 @@ namespace phi { "`"); \ } \ }() -#if defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_XPU) #define PD_VISIT_ALL_TYPES(TYPE, NAME, ...) \ [&] { \ const auto& __dtype__ = TYPE; \ diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 9ba70ce824b39..261b99512a0ff 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -1405,6 +1405,7 @@ void FusedRopeGradInferMeta(const MetaTensor& sin, const MetaTensor& dout_v, bool use_neox_rotary_style, bool time_major, + float rotary_emb_base, MetaTensor* dq, MetaTensor* dk, MetaTensor* dv) { diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 278b4ba970ff1..88aea8f18181b 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -210,6 +210,7 @@ void FusedRopeGradInferMeta(const MetaTensor& sin, const MetaTensor& dout_v, bool use_neox_rotary_style, bool time_major, + float rotary_emb_base, MetaTensor* dq, MetaTensor* dk, MetaTensor* dv); diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 97edce9ad7953..63d1d1c9b32d0 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -1532,7 +1532,7 @@ void ExpandAsInferMeta(const MetaTensor& x, const MetaTensor& y, const std::vector<int>& target_shape, MetaTensor* out) { -#define MAX_RANK_SUPPORTED 6 +#define MAX_RANK_SUPPORTED 8 auto x_dims = x.dims(); PADDLE_ENFORCE_GE( target_shape.size(), diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc index b56e7fab0bfe6..e8eb740e453ff 100644 --- a/paddle/phi/infermeta/fusion.cc +++ b/paddle/phi/infermeta/fusion.cc @@ -116,6 +116,20 @@ void AddLayernormXPUInferMeta(const MetaTensor& x, out->share_lod(x); } +void GroupNormalizeSiluXPUInferMeta(const MetaTensor& x, + const MetaTensor& scale, + const MetaTensor& bias, + int groups, + float epsilon, + MetaTensor* out) { + auto x_dims = x.dims(); + auto out_dims = x_dims; + out->set_dims(out_dims); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + out->share_lod(x); +} + void FusedMultiTransformerInferMeta( const MetaTensor& x, const std::vector<const MetaTensor*>& ln_scales, @@ -568,6 +582,36 @@ void Conv2dXPUInferMeta(const MetaTensor& x, out->set_dtype(out_dtype); } +void
SpatialTransformerResblockXPUInferMeta( + const MetaTensor& x, + const std::vector<const MetaTensor*>& x_max, + const std::vector<const MetaTensor*>& conv_bias, + const std::vector<const MetaTensor*>& conv_filter, + const std::vector<const MetaTensor*>& conv_filter_max, + const std::vector<const MetaTensor*>& gn_bias, + const std::vector<const MetaTensor*>& gn_scale, + const std::vector<int>& dilations, + const std::vector<int>& paddings, + const std::vector<int>& strides, + const std::vector<float>& gn_eps, + const std::vector<int>& gn_groups, + const std::vector<int>& groups, + bool conv_fix, + bool has_silu_fc_input, + bool include_silu, + MetaTensor* out, + MetaTensor* out_max) { + auto input_shape = x.dims(); + auto batch_size = input_shape[0]; + auto channel_out = conv_filter[0]->dims()[0]; + auto h = input_shape[2]; + auto w = input_shape[3]; + out->set_dims(common::make_ddim({batch_size, channel_out, h, w})); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + out->share_lod(x); +} + void EmbeddingWithEltwiseAddXPUInferMeta( const std::vector<const MetaTensor*>& ids, const std::vector<const MetaTensor*>& tables, @@ -3032,7 +3076,7 @@ void FusedConv2dAddActInferMeta(const MetaTensor& input, MetaTensor* output, std::vector outputs, MetaConfig config) { - // TODO(liuyuanle): mkldnn seems only support nchw. + // TODO(liuyuanle): onednn seems to only support nchw. const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); std::vector out_shape = ComputeOutputShape(input, filter, @@ -3731,13 +3775,14 @@ void QKVAttentionXPUInferMeta(const MetaTensor& q, const MetaTensor& q_max, const MetaTensor& k_max, const MetaTensor& v_max, + const MetaTensor& qk_max, + const MetaTensor& qkv_max, float alpha, int head_num, int head_dim, bool qkv_fc_fusion, DataType out_dtype, - MetaTensor* qkv, - MetaTensor* qkv_max) { + MetaTensor* qkv) { auto q_dims = q.dims(); auto k_dims = k.dims(); auto v_dims = v.dims(); @@ -3781,9 +3826,6 @@ void QKVAttentionXPUInferMeta(const MetaTensor& q, qkv->set_dims(phi::make_ddim({q_dims[0], q_dims[1], head_num * head_dim})); qkv->set_dtype(out_dtype); qkv->set_layout(q.layout()); - qkv_max->set_dims(phi::make_ddim({6})); - qkv_max->set_dtype(out_dtype); - qkv_max->set_layout(q.layout()); } void SinePosXPUInferMeta(const MetaTensor& x, const MetaTensor& y, @@ -3816,6 +3858,95 @@ void SinePosXPUInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +void CrossAttentionXPUInferMeta( + const MetaTensor& input_q, + const MetaTensor& input_kv, + const std::vector<const MetaTensor*>& fc_weight, + const std::vector<const MetaTensor*>& fc_weight_max, + const std::vector<const MetaTensor*>& fc_bias, + const MetaTensor& mask, + int head_num, + int head_dim, + float alpha, + DataType out_dtype, + MetaTensor* qkv, + MetaTensor* qkv_max) { + auto input_q_dims = input_q.dims(); + auto input_kv_dims = input_kv.dims(); + auto mask_dims = mask.dims(); + // input shape : {B, L, H*D} + PADDLE_ENFORCE_EQ(input_q_dims.size(), + 3, + phi::errors::InvalidArgument( + "The dim of input_q should be 3! But received ", + input_q_dims.size())); + PADDLE_ENFORCE_EQ(input_kv_dims.size(), + 3, + phi::errors::InvalidArgument( + "The dim of input_kv should be 3! But received ", + input_kv_dims.size())); + // the sequence lengths of q and k/v are not required to be equal, + // but batch size and dim should be the same + PADDLE_ENFORCE_EQ( + input_q_dims[0], + input_kv_dims[0], + phi::errors::InvalidArgument("The batch size of input_q and input_kv " + "should be the same! Received ", + input_q_dims[0], + " vs ", + input_kv_dims[0])); + PADDLE_ENFORCE_EQ( + input_q_dims[2], + input_kv_dims[2], + phi::errors::InvalidArgument("The hidden_dim of input_q and input_kv " + "should be the same!
Received ", + input_q_dims[2], + " vs ", + input_kv_dims[2])); + int hidden_dim = head_num * head_dim; + PADDLE_ENFORCE_EQ( + input_q_dims[2], + hidden_dim, + phi::errors::InvalidArgument( + "The last dimension of input_q should be [H*D]! Received ", + input_q_dims[2], + " != expected ", + hidden_dim)); + PADDLE_ENFORCE_EQ(fc_weight.size(), + 3, + phi::errors::InvalidArgument( + "The size of fc_weight should be 3! But received ", + fc_weight.size())); + PADDLE_ENFORCE_EQ(fc_weight_max.size(), + 3, + phi::errors::InvalidArgument( + "The size of fc_weight_max should be 3! But received ", + fc_weight_max.size())); + PADDLE_ENFORCE_EQ( + fc_bias.size(), + 3, + phi::errors::InvalidArgument( + "The size of fc_bias should be 3! But received ", fc_bias.size())); + PADDLE_ENFORCE_LE( + mask_dims.size(), + 4, + phi::errors::InvalidArgument( + "The dim of mask should be not greater than 4!", mask_dims.size())); + + // output shape: {B, qL, H*D} + qkv->set_dims( + phi::make_ddim({input_q_dims[0], input_q_dims[1], head_num * head_dim})); + qkv->set_dtype(out_dtype); + qkv->set_layout(input_q.layout()); + // TODO(Terry) optmize the max value num + // unable to pass few PR-CIs, so just use a constant value + // int xpu2_max_value_num = phi::backends::xpu::get_xpu_max_ptr_size(-1); + const int xpu2_max_value_num = 6; + qkv_max->set_dims(phi::make_ddim({xpu2_max_value_num})); + qkv_max->set_dtype(out_dtype); + qkv_max->set_layout(input_q.layout()); +} + void MultiGruInferMeta( const MetaTensor& x, const std::vector& weight_x, diff --git a/paddle/phi/infermeta/fusion.h b/paddle/phi/infermeta/fusion.h index 0a7224e39f73b..632a656414b4f 100644 --- a/paddle/phi/infermeta/fusion.h +++ b/paddle/phi/infermeta/fusion.h @@ -70,6 +70,13 @@ void AddLayernormXPUInferMeta(const MetaTensor& x, float epsilon, MetaTensor* out); +void GroupNormalizeSiluXPUInferMeta(const MetaTensor& x, + const MetaTensor& scale, + const MetaTensor& bias, + int groups, + float epsilon, + MetaTensor* out); + void BlockMultiheadAttentionInferMeta(const MetaTensor& qkv, const MetaTensor& key_cache, const MetaTensor& value_cache, @@ -145,6 +152,26 @@ void Conv2dXPUInferMeta(const MetaTensor& x, MetaTensor* out, MetaTensor* out_max); +void SpatialTransformerResblockXPUInferMeta( + const MetaTensor& x, + const std::vector& x_max, + const std::vector& conv_bias, + const std::vector& conv_filter, + const std::vector& conv_filter_max, + const std::vector& gn_bias, + const std::vector& gn_scale, + const std::vector& dilations, + const std::vector& paddings, + const std::vector& strides, + const std::vector& gn_eps, + const std::vector& gn_groups, + const std::vector& groups, + bool conv_fix, + bool has_silu_fc_input, + bool include_silu, + MetaTensor* out, + MetaTensor* out_max); + void EmbeddingWithEltwiseAddXPUInferMeta( const std::vector& ids, const std::vector& tables, @@ -862,13 +889,14 @@ void QKVAttentionXPUInferMeta(const MetaTensor& q, const MetaTensor& q_max, const MetaTensor& k_max, const MetaTensor& v_max, + const MetaTensor& qk_max, + const MetaTensor& qkv_max, float alpha, int head_num, int head_dim, bool qkv_fc_fusion, DataType out_dtype, - MetaTensor* qkv, - MetaTensor* qkv_max); + MetaTensor* qkv); void SinePosXPUInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); @@ -877,6 +905,19 @@ void RoformerRelativePosXPUInferMeta(const MetaTensor& x, const MetaTensor& cos_emb, int max_pos_len, MetaTensor* out); +void CrossAttentionXPUInferMeta( + const MetaTensor& input_q, + const MetaTensor& input_kv, + const std::vector& 
fc_weight, + const std::vector<const MetaTensor*>& fc_weight_max, + const std::vector<const MetaTensor*>& fc_bias, + const MetaTensor& mask, + int head_num, + int head_dim, + float alpha, + DataType out_dtype, + MetaTensor* qkv, + MetaTensor* qkv_max); void MultiGruInferMeta( const MetaTensor& x, diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index a71f0b37437ab..ceebbdb5b2d74 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -4273,6 +4273,15 @@ void WeightOnlyLinearInferMeta(const MetaTensor& x, "But received Input(X) dim[-1](%s) != Input(Weight) dim[1](%s)", x_dims[x_dims.size() - 1], w_dims[1])); + if (bias.initialized()) { + auto bias_dims = bias.dims(); + PADDLE_ENFORCE_EQ( + bias_dims.size(), + 1UL, + errors::InvalidArgument( + "The size of Input(Bias)'s dimension should be equal to 1UL.", + bias_dims.size())); + } // per-channel dequantization if (group_size == -1) { @@ -4554,6 +4563,7 @@ void FusedRopeInferMeta(const MetaTensor& q, const MetaTensor& position_ids, bool use_neox_rotary_style, bool time_major, + float rotary_emb_base, MetaTensor* out_q, MetaTensor* out_k, MetaTensor* out_v) { @@ -4911,10 +4921,10 @@ void MaskedMultiheadAttentionInferMeta(const MetaTensor& x, } } -void FullWithTensorInferMeta(const MetaTensor& shape, +void FullWithTensorInferMeta(const IntArray& shape, DataType dtype, MetaTensor* out) { - out->set_dims(common::make_ddim(std::vector<int64_t>(shape.numel(), -1))); + out->set_dims(common::make_ddim(shape.GetData())); out->set_dtype(dtype); } diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 3722a0d5844ba..8d6a366fdbb24 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -904,6 +904,7 @@ void FusedRopeInferMeta(const MetaTensor& q, const MetaTensor& position_ids, bool use_neox_rotary_style, bool time_major, + float rotary_emb_base, MetaTensor* out_q, MetaTensor* out_k, MetaTensor* out_v); @@ -951,7 +952,7 @@ void MaskedMultiheadAttentionInferMeta(const MetaTensor& x, MetaTensor* cache_kv_out, MetaTensor* beam_cache_offset_out); -void FullWithTensorInferMeta(const MetaTensor& shape, +void FullWithTensorInferMeta(const IntArray& shape, DataType dtype, MetaTensor* out); diff --git a/paddle/phi/infermeta/spmd_rules/fused_rope.cc b/paddle/phi/infermeta/spmd_rules/fused_rope.cc index e58b987fb3499..9b2be06066799 100644 --- a/paddle/phi/infermeta/spmd_rules/fused_rope.cc +++ b/paddle/phi/infermeta/spmd_rules/fused_rope.cc @@ -241,7 +241,8 @@ SpmdInfo FusedRopeInferSpmd(const DistMetaTensor& q, const DistMetaTensor& cos, const DistMetaTensor& position_ids, bool use_neox_rotary_style, - bool time_major) { + bool time_major, + float rotary_emb_base) { check_q(q); std::vector>> @@ -392,7 +393,8 @@ SpmdInfo FusedRopeInferSpmdReverse(const DistMetaTensor& q, const DistMetaTensor& out_k, const DistMetaTensor& out_v, bool use_neox_rotary_style, - bool time_major) { + bool time_major, + float rotary_emb_base) { check_q(out_q); std::vector>> outputs_sharding_info; @@ -548,7 +550,8 @@ SpmdInfo FusedRopeGradInferSpmd(const DistMetaTensor& sin, const DistMetaTensor& out_k_grad, const DistMetaTensor& out_v_grad, bool use_neox_rotary_style, - bool time_major) { + bool time_major, + float rotary_emb_base) { // NOTE(zhonghui): The forward and backward kernels of fuse rope are the same, // so the spmd rules can be shared.
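// In practice the gradient tensors are simply fed through the forward rule
// below, so each out_*_grad inherits the dims mapping its forward counterpart
// would receive; rotary_emb_base is presumably threaded through only to keep
// the kernel and rule signatures aligned, since a scalar attribute cannot
// affect the sharding decision.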
SpmdInfo spmd_info = FusedRopeInferSpmd(out_q_grad, diff --git a/paddle/phi/infermeta/spmd_rules/fused_rope.h b/paddle/phi/infermeta/spmd_rules/fused_rope.h index 3a5c331098ad1..63eba399a3bbf 100644 --- a/paddle/phi/infermeta/spmd_rules/fused_rope.h +++ b/paddle/phi/infermeta/spmd_rules/fused_rope.h @@ -30,7 +30,8 @@ SpmdInfo FusedRopeInferSpmd(const DistMetaTensor& q, const DistMetaTensor& cos, const DistMetaTensor& position_ids, bool use_neox_rotary_style = true, - bool time_major = false); + bool time_major = false, + float rotary_emb_base = 10000.f); SpmdInfo FusedRopeInferSpmdReverse(const DistMetaTensor& q, const DistMetaTensor& k, @@ -42,7 +43,8 @@ SpmdInfo FusedRopeInferSpmdReverse(const DistMetaTensor& q, const DistMetaTensor& out_k, const DistMetaTensor& out_v, bool use_neox_rotary_style = true, - bool time_major = false); + bool time_major = false, + float rotary_emb_base = 10000.f); SpmdInfo FusedRopeGradInferSpmd(const DistMetaTensor& sin, const DistMetaTensor& cos, @@ -51,7 +53,8 @@ SpmdInfo FusedRopeGradInferSpmd(const DistMetaTensor& sin, const DistMetaTensor& out_k_grad, const DistMetaTensor& out_v_grad, bool use_neox_rotary_style = true, - bool time_major = false); + bool time_major = false, + float rotary_emb_base = 10000.f); } // namespace distributed } // namespace phi diff --git a/paddle/phi/infermeta/spmd_rules/rules.cc b/paddle/phi/infermeta/spmd_rules/rules.cc index 9c6492ee75913..d74beb98de74e 100644 --- a/paddle/phi/infermeta/spmd_rules/rules.cc +++ b/paddle/phi/infermeta/spmd_rules/rules.cc @@ -502,6 +502,16 @@ PD_REGISTER_SPMD_RULE( PD_INFER_SPMD(phi::distributed::LayerNormInferSpmd), PD_INFER_SPMD(phi::distributed::LayerNormInferSpmdReverse)); +// fused_rms_norm +// NOTE(ZHIQIU): Temporarily register fused_rms_norm rule, +// this is not for the rms_norm kernel, but for the custom kernel +// 'fused_rms_norm' in PaddleNLP. +// It will no longer be needed when the PIR-AutoParallel project +// is finished.
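+// The registration below reuses the existing rms_norm rules verbatim; the
+// macro arguments are (op_name, forward rule, reverse rule), and a custom
+// kernel registered under the name `fused_rms_norm` is assumed to pick the
+// rule up by that name with no further glue.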
+PD_REGISTER_SPMD_RULE(fused_rms_norm, + PD_INFER_SPMD(phi::distributed::RmsNormInferSpmd), + PD_INFER_SPMD(phi::distributed::RmsNormInferSpmdReverse)); + PD_REGISTER_SPMD_RULE( flash_attention, PD_INFER_SPMD(phi::distributed::FlashAttInferSpmdStatic), diff --git a/paddle/phi/infermeta/spmd_rules/swiglu.cc b/paddle/phi/infermeta/spmd_rules/swiglu.cc index 924a80c2e39a0..040b8100d8042 100644 --- a/paddle/phi/infermeta/spmd_rules/swiglu.cc +++ b/paddle/phi/infermeta/spmd_rules/swiglu.cc @@ -27,8 +27,14 @@ namespace distributed { SpmdInfo SwiGLUInferSpmd(const DistMetaTensor& x, const DistMetaTensor& y) { // y.dist_attr() is empty means y is None if (y.dist_attr() == TensorDistAttr()) { - PADDLE_THROW( - phi::errors::Unimplemented("The input y is not allowed to be None")); + auto x_dims_mapping = x.dist_attr().dims_mapping(); + if (x_dims_mapping.back() != -1) { + PADDLE_THROW( + phi::errors::Unimplemented("The case where input y is None and " + "input x's last dim is sharded is not supported")); + } + auto res = ElementwiseUnaryInferSpmd(x); + return {{res.first[0], y.dist_attr()}, {res.second[0]}}; } else { return ElementwiseBinaryInferSpmd(x, y); } @@ -38,8 +44,14 @@ SpmdInfo SwiGLUInferSpmdReverse(const DistMetaTensor& x, const DistMetaTensor& y, const DistMetaTensor& out) { if (y.dist_attr() == TensorDistAttr()) { - PADDLE_THROW( - phi::errors::Unimplemented("The input y is not allowed to be None")); + auto x_dims_mapping = x.dist_attr().dims_mapping(); + if (x_dims_mapping.back() != -1) { + PADDLE_THROW( + phi::errors::Unimplemented("The case where input y is None and " + "input x's last dim is sharded is not supported")); + } + auto res = ElementwiseUnaryInferSpmdReverse(x, out); + return {{res.first[0], y.dist_attr()}, {res.second[0]}}; } else { return ElementwiseBinaryInferSpmdReverse(x, y, out); } @@ -49,8 +61,15 @@ SpmdInfo SwiGLUGradInferSpmd(const DistMetaTensor& x, const DistMetaTensor& y, const DistMetaTensor& out_grad) { if (y.dist_attr() == TensorDistAttr()) { - PADDLE_THROW( - phi::errors::Unimplemented("The input y is not allowed to be None")); + auto x_dims_mapping = x.dist_attr().dims_mapping(); + if (x_dims_mapping.back() != -1) { + PADDLE_THROW( + phi::errors::Unimplemented("The case where input y is None and " + "input x's last dim is sharded is not supported")); + } + auto res = ElementwiseUnaryGradInferSpmd(x, out_grad); + return {{res.first[0], y.dist_attr(), res.first[1]}, + {res.second[0], y.dist_attr()}}; } else { return ElementwiseBinaryGradInferSpmd(x, y, out_grad); } diff --git a/paddle/phi/infermeta/spmd_rules/tile.cc b/paddle/phi/infermeta/spmd_rules/tile.cc index 76eb0dd95f632..e6d98a1b28303 100644 --- a/paddle/phi/infermeta/spmd_rules/tile.cc +++ b/paddle/phi/infermeta/spmd_rules/tile.cc @@ -151,7 +151,7 @@ SpmdInfo TileInferSpmdReverse(const DistMetaTensor& x, auto x_dist_attr_dst = CopyTensorDistAttrForOutput(x_dist_attr_src); x_dist_attr_dst.set_dims_mapping(x_dims_mapping_dst); - VLOG(4) << "TriuInferSpmdReverse:"; + VLOG(4) << "TileInferSpmdReverse:"; VLOG(4) << "out shape: [" << str_join(out_shape) << "]" << "src_dims_mapping: [" << str_join(out_dist_attr_src.dims_mapping()) diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index f10a86b33836a..beba7457039cc 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -146,6 +146,25 @@ void AddmmInferMeta(const MetaTensor& input, out->set_dtype(input.dtype()); } +void AssignPosInferMeta(const MetaTensor& x, + const MetaTensor& cum_count, + const MetaTensor& eff_num_len, +
MetaTensor* out) { + phi::DataType X_dtype = x.dtype(); + phi::DataType cum_count_dtype = cum_count.dtype(); + + PADDLE_ENFORCE_EQ(cum_count_dtype, + X_dtype, + phi::errors::InvalidArgument( + "The dtype of cum_count and X should be the same")); + PADDLE_ENFORCE_EQ(cum_count_dtype, + phi::DataType::INT64, + phi::errors::InvalidArgument( + "The dtype of cum_count, eff_num_len and " + "X should be int64")); + out->set_dtype(X_dtype); +} + void BatchFCInferMeta(const MetaTensor& input, const MetaTensor& w, const MetaTensor& bias, @@ -1429,12 +1448,19 @@ void ScatterNdAddInferMeta(const MetaTensor& x, // update.shape = index.shape[:-1] + output.shape[index.shape[-1]:] std::vector<int64_t> r_updates_dims; + bool without_dynamic_shape = true; for (int i = 0; i < index_dims_size - 1; ++i) { + if (index_dims[i] == -1) { + without_dynamic_shape = false; + } r_updates_dims.emplace_back(index_dims[i]); } for (int i = static_cast<int>(index_dims[index_dims_size - 1]); i < ref_dims_size; ++i) { + if (ref_dims[i] == -1) { + without_dynamic_shape = false; + } r_updates_dims.emplace_back(ref_dims[i]); } // check for non-0d updates @@ -1442,25 +1468,27 @@ void ScatterNdAddInferMeta(const MetaTensor& x, r_updates_dims.size(), updates_dims_size, phi::errors::InvalidArgument( - "Updates has wrong shape. The shape of Updates and Input(Updates) " + "Updates has wrong shape. The shape of Updates and " + "Input(Updates) " "should be same, but received the shape of Updates is %d, " "the shape of Input(Updates) is %d.", r_updates_dims.size(), updates_dims_size)); - - for (int64_t i = 0; i < updates_dims_size; ++i) { - PADDLE_ENFORCE_EQ( - r_updates_dims[i], - updates_dims[i], - phi::errors::InvalidArgument( - "Updates has wrong shape. The dimensions of Updates and " - "Input(Updates) should match, but received Updates's" - "%d-th dimension is %d, Input(Updates)'s %d-th " - "dimension is %d.", - i, - r_updates_dims[i], - i, - updates_dims[i])); + if (without_dynamic_shape) { + for (int64_t i = 0; i < updates_dims_size; ++i) { + PADDLE_ENFORCE_EQ( + r_updates_dims[i], + updates_dims[i], + phi::errors::InvalidArgument( + "Updates has wrong shape.
The dimensions of Updates and " + "Input(Updates) should match, but received Updates's" + "%d-th dimension is %d, Input(Updates)'s %d-th " + "dimension is %d.", + i, + r_updates_dims[i], + i, + updates_dims[i])); + } } } out->set_dims(ref_dims); diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index c1c1af6f08218..c7c31e767f40f 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -53,6 +53,11 @@ void ArangeTensorInferMeta(const MetaTensor& start, const MetaTensor& step, MetaTensor* out); +void AssignPosInferMeta(const MetaTensor& x, + const MetaTensor& cum_count, + const MetaTensor& eff_num_len, + MetaTensor* out); + void BatchFCInferMeta(const MetaTensor& input, const MetaTensor& w, const MetaTensor& bias, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 74d04da5de8f2..a152bc152ae6b 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -1219,7 +1219,7 @@ void EinsumRawInferMeta(const std::vector& inputs, void ExpandInferMeta(const MetaTensor& x, const IntArray& shape, MetaTensor* out) { -#define MAX_RANK_SUPPORTED 6 +#define EXPAND_MAX_RANK_SUPPORTED 8 auto x_dims = x.dims(); auto expand_shape = shape.GetData(); @@ -1238,11 +1238,11 @@ void ExpandInferMeta(const MetaTensor& x, static_cast(x_dims.size()))); PADDLE_ENFORCE_LE( expand_shape.size(), - MAX_RANK_SUPPORTED, + EXPAND_MAX_RANK_SUPPORTED, phi::errors::InvalidArgument("The number of elements (%d) of 'shape' for " "must not be greater than %d.", expand_shape.size(), - MAX_RANK_SUPPORTED)); + EXPAND_MAX_RANK_SUPPORTED)); PADDLE_ENFORCE_GE( expand_shape.size(), 0, @@ -1283,6 +1283,7 @@ void ExpandInferMeta(const MetaTensor& x, if (out_rank > 0 && out_shape[0] == x_dims[0]) { out->share_lod(x); } +#undef EXPAND_MAX_RANK_SUPPORTED } void FillAnyLikeInferMeta(const MetaTensor& x, @@ -4722,7 +4723,7 @@ void TileInferMeta(const MetaTensor& x, const IntArray& repeat_times, MetaTensor* out, MetaConfig config) { -#define MAX_RANK_SUPPORTED 6 +#define TILE_MAX_RANK_SUPPORTED 6 auto repeat_times_data = repeat_times.GetData(); auto x_dims = x.dims(); @@ -4732,19 +4733,19 @@ void TileInferMeta(const MetaTensor& x, PADDLE_ENFORCE_LE( x_dims.size(), - MAX_RANK_SUPPORTED, + TILE_MAX_RANK_SUPPORTED, errors::InvalidArgument( "The rank of the input 'x' for tile op " "must not be greater than %d, but the value received is %d.", - MAX_RANK_SUPPORTED, + TILE_MAX_RANK_SUPPORTED, x_dims.size())); PADDLE_ENFORCE_LE( repeat_times_data.size(), - MAX_RANK_SUPPORTED, + TILE_MAX_RANK_SUPPORTED, errors::InvalidArgument( "The size of the shape of input 'repeat_times' for tile op " "must not be greater than %d, but the value received is %d.", - MAX_RANK_SUPPORTED, + TILE_MAX_RANK_SUPPORTED, repeat_times_data.size())); PADDLE_ENFORCE_GE( repeat_times_data.size(), @@ -4785,6 +4786,7 @@ void TileInferMeta(const MetaTensor& x, out->share_lod(x); } out->set_dtype(x.dtype()); +#undef TILE_MAX_RANK_SUPPORTED } void TopKInferMeta(const MetaTensor& x, diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 304fd3cef793a..31de8c3e244be 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -209,11 +209,9 @@ if(WITH_ROCM) "gpu/lu_kernel.cu" "gpu/matrix_rank_kernel.cu" "gpu/matrix_rank_tol_kernel.cu" - "gpu/multiclass_nms3_kernel.cu" "gpu/put_along_axis_grad_kernel.cu" "gpu/put_along_axis_kernel.cu" "gpu/qr_kernel.cu" - "gpu/rms_norm_grad_kernel.cu" "gpu/svd_kernel.cu" 
"gpudnn/mha_cudnn_frontend.cu" "fusion/gpu/block_multi_head_attention_kernel.cu" @@ -239,7 +237,7 @@ set(cc_search_pattern "stride/*.cc" "fusion/cpu/*.cc") -if(WITH_MKLDNN) +if(WITH_ONEDNN) set(cc_search_pattern ${cc_search_pattern} "legacy/onednn/*.cc" "onednn/*.cc" "fusion/onednn/*.cc") endif() @@ -262,7 +260,9 @@ if(NOT AND AVX512F_FOUND AND AVX512F_FLAG AND WITH_MKL)) + list(REMOVE_ITEM kernel_cc "fusion/cpu/fused_layer_norm_avx_kernel.cc") list(REMOVE_ITEM kernel_cc "fusion/cpu/self_dp_attention_kernel.cc") + list(REMOVE_ITEM kernel_cc "fusion/cpu/rms_norm_avx_kernel.cc") endif() file( diff --git a/paddle/phi/kernels/cpu/cross_grad_kernel.cc b/paddle/phi/kernels/cpu/cross_grad_kernel.cc index 882c3dd9ee512..4c41107ba0199 100644 --- a/paddle/phi/kernels/cpu/cross_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/cross_grad_kernel.cc @@ -18,6 +18,8 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { @@ -81,9 +83,27 @@ void CrossGradKernel(const Context &dev_ctx, slice_size *= static_cast(input_x_dims[i]); } + int64_t numel = x.numel(); + DenseTensor x_conj, y_conj; + DenseTensorMeta meta_xy(x.dtype(), x.dims()); + x_conj.set_meta(meta_xy); + y_conj.set_meta(meta_xy); + + auto *input_x_conj_data = dev_ctx.template Alloc(&x_conj); + + auto *input_y_conj_data = dev_ctx.template Alloc(&y_conj); + + phi::funcs::ForRange for_range(dev_ctx, numel); + phi::funcs::ConjFunctor functor_x( + input_x.data(), numel, input_x_conj_data); + phi::funcs::ConjFunctor functor_y( + input_y.data(), numel, input_y_conj_data); + for_range(functor_x); + for_range(functor_y); + std::vector input_x_vec, input_y_vec, input_dout_vec; - phi::TensorToVector(input_x, dev_ctx, &input_x_vec); - phi::TensorToVector(input_y, dev_ctx, &input_y_vec); + phi::TensorToVector(x_conj, dev_ctx, &input_x_vec); + phi::TensorToVector(y_conj, dev_ctx, &input_y_vec); phi::TensorToVector(input_out_grad, dev_ctx, &input_dout_vec); std::vector out_dx_vec(output_x_grad->numel()); std::vector out_dy_vec(output_y_grad->numel()); @@ -120,4 +140,6 @@ PD_REGISTER_KERNEL(cross_grad, float, double, int, - int64_t) {} + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/cross_kernel.cc b/paddle/phi/kernels/cpu/cross_kernel.cc index 0f45b7c304e31..95f826cfe9132 100644 --- a/paddle/phi/kernels/cpu/cross_kernel.cc +++ b/paddle/phi/kernels/cpu/cross_kernel.cc @@ -105,5 +105,13 @@ void CrossKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL( - cross, CPU, ALL_LAYOUT, phi::CrossKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL(cross, + CPU, + ALL_LAYOUT, + phi::CrossKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/full_kernel.cc b/paddle/phi/kernels/cpu/full_kernel.cc index b1a6ceda3647d..278b3bea324f1 100644 --- a/paddle/phi/kernels/cpu/full_kernel.cc +++ b/paddle/phi/kernels/cpu/full_kernel.cc @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" -#include "paddle/phi/kernels/impl/full_whit_tensor_kernel_impl.h" +#include "paddle/phi/kernels/impl/full_with_tensor_kernel_impl.h" namespace phi { @@ -156,5 +156,4 @@ PD_REGISTER_KERNEL(full_with_tensor, phi::dtype::complex, phi::dtype::complex) { kernel->InputAt(0).SetBackend(phi::Backend::CPU); - kernel->InputAt(1).SetBackend(phi::Backend::CPU); } diff --git a/paddle/phi/kernels/cpu/isfinite_kernel.cc b/paddle/phi/kernels/cpu/isfinite_kernel.cc index c9f69c5f7e4f5..2fa44670c15c2 100644 --- a/paddle/phi/kernels/cpu/isfinite_kernel.cc +++ b/paddle/phi/kernels/cpu/isfinite_kernel.cc @@ -27,7 +27,10 @@ PD_REGISTER_KERNEL(isinf, phi::dtype::float16, phi::dtype::bfloat16, int, - int64_t) { + int64_t, + int16_t, + int8_t, + uint8_t) { kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); } diff --git a/paddle/phi/kernels/cpu/log_softmax_kernel.cc b/paddle/phi/kernels/cpu/log_softmax_kernel.cc index a57ab908d24ca..26e894945284c 100644 --- a/paddle/phi/kernels/cpu/log_softmax_kernel.cc +++ b/paddle/phi/kernels/cpu/log_softmax_kernel.cc @@ -122,7 +122,7 @@ void LogSoftmaxKernel(const Context& dev_ctx, } // namespace phi -// TODO(YuanRisheng): The layout of mkldnn kernel should be MKLDNN, we should +// TODO(YuanRisheng): The layout of onednn kernel should be OneDNN, we should // support specifying the exact layout when the kernel is registered PD_REGISTER_KERNEL( log_softmax, CPU, ALL_LAYOUT, phi::LogSoftmaxKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/nanmedian_kernel.cc b/paddle/phi/kernels/cpu/nanmedian_kernel.cc index 2911d5c0fcec5..c38cb831d379b 100644 --- a/paddle/phi/kernels/cpu/nanmedian_kernel.cc +++ b/paddle/phi/kernels/cpu/nanmedian_kernel.cc @@ -103,8 +103,12 @@ void CalcMedianFunc(const Context& dev_ctx, offset = i * sort_k; int64_t pos = offset + sort_k - 1; o_ptr[i] = sort_out_ptr[pos]; - m_ptr[2 * i] = sort_indices_ptr[pos]; - m_ptr[2 * i + 1] = sort_indices_ptr[pos]; + if (mode == "avg") { + m_ptr[2 * i] = sort_indices_ptr[pos]; + m_ptr[2 * i + 1] = sort_indices_ptr[pos]; + } else { + m_ptr[i] = sort_indices_ptr[pos]; + } } } else { for (i = 0; i < pre_dim; i++) { diff --git a/paddle/phi/kernels/cpu/shape_broadcast_kernel.cc b/paddle/phi/kernels/cpu/shape_broadcast_kernel.cc new file mode 100644 index 0000000000000..3157be039255c --- /dev/null +++ b/paddle/phi/kernels/cpu/shape_broadcast_kernel.cc @@ -0,0 +1,84 @@ +/* Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/core/dense_tensor.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/utils/array_ref.h" + +namespace phi { + +template <typename T> +std::vector<T> ComputeBroadcastShape(const paddle::array_ref<T>& large_shape, + const paddle::array_ref<T>& small_shape) { + PADDLE_ENFORCE_GE( + large_shape.size(), + small_shape.size(), + phi::errors::PreconditionNotMet( + "The size of large_shape is expected to be greater than or equal to " + "the size of small_shape, but got [%d] vs [%d].", + large_shape.size(), + small_shape.size())); + std::vector<T> output_data; + output_data.reserve(large_shape.size()); + auto rank_gap = large_shape.size() - small_shape.size(); + for (size_t i = 0; i < rank_gap; ++i) { + output_data.push_back(large_shape[i]); + } + for (size_t i = 0; i < small_shape.size(); ++i) { + output_data.push_back(std::max(large_shape[i + rank_gap], small_shape[i])); + } + return output_data; +} + +template <typename T, typename Context> +void ShapeBroadcastKernel(const Context& ctx, + const DenseTensor& x_shape, + const DenseTensor& y_shape, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + x_shape.dims().size(), + 1, + phi::errors::InvalidArgument("Invalid input tensor. The rank of x_shape " + "should be equal to 1, but now received [%d].", + x_shape.dims().size())); + PADDLE_ENFORCE_EQ( + y_shape.dims().size(), + 1, + phi::errors::InvalidArgument("Invalid input tensor. The rank of y_shape " + "should be equal to 1, but now received [%d].", + y_shape.dims().size())); + paddle::array_ref<T> x_shape_data(x_shape.data<T>(), x_shape.numel()); + paddle::array_ref<T> y_shape_data(y_shape.data<T>(), y_shape.numel()); + const auto& output_data = + x_shape_data.size() > y_shape_data.size() + ? ComputeBroadcastShape(x_shape_data, y_shape_data) + : ComputeBroadcastShape(y_shape_data, x_shape_data); + T* out_data = ctx.template HostAlloc<T>(out); + int64_t out_numel = out->numel(); + for (int i = 0; i < out_numel; ++i) { + out_data[i] = output_data[i]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(shape_broadcast, + CPU, + ALL_LAYOUT, + phi::ShapeBroadcastKernel, + int32_t, + int64_t) {} diff --git a/paddle/phi/kernels/elementwise_divide_grad_kernel.h b/paddle/phi/kernels/elementwise_divide_grad_kernel.h index c764f05c3983f..15b1e65a9cfdf 100644 --- a/paddle/phi/kernels/elementwise_divide_grad_kernel.h +++ b/paddle/phi/kernels/elementwise_divide_grad_kernel.h @@ -33,7 +33,8 @@ template <typename T, typename Context> void DivideDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, const DenseTensor& out, - const DenseTensor& dx, + const DenseTensor& grad_out, + const paddle::optional<DenseTensor>& dx, const paddle::optional<DenseTensor>& ddx, const paddle::optional<DenseTensor>& ddy, int axis, diff --git a/paddle/phi/kernels/flatten_kernel.h b/paddle/phi/kernels/flatten_kernel.h index b941a1fbb9691..ac53c5b82c6cb 100644 --- a/paddle/phi/kernels/flatten_kernel.h +++ b/paddle/phi/kernels/flatten_kernel.h @@ -40,7 +40,8 @@ void FlattenInferStridedKernel(const Context& dev_ctx, const DenseTensor& x, int start_axis, int stop_axis, - DenseTensor* out); + DenseTensor* out, + DenseTensor* xshape); template <typename T, typename Context> void FlattenStridedKernel(const Context& dev_ctx, diff --git a/paddle/phi/kernels/full_kernel.h b/paddle/phi/kernels/full_kernel.h index b10e02658fe75..e6d80ed43dff4 100644 --- a/paddle/phi/kernels/full_kernel.h +++ b/paddle/phi/kernels/full_kernel.h @@ -33,8 +33,8 @@ void FullKernel(const Context& dev_ctx, template <typename T, typename Context> void FullWithTensorKernel(const Context& dev_ctx, - const DenseTensor& shape, const DenseTensor& value, + const IntArray& shape, DataType dtype,
DenseTensor* out); diff --git a/paddle/phi/kernels/funcs/common_shape.h b/paddle/phi/kernels/funcs/common_shape.h index 19f2fa1f2fac4..45a1024339ba3 100644 --- a/paddle/phi/kernels/funcs/common_shape.h +++ b/paddle/phi/kernels/funcs/common_shape.h @@ -52,7 +52,6 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims, "Axis should be less than or equal to %d, but received axis is %d.", max_dim, axis)); - if (x_dims.size() > y_dims.size()) { std::fill(y_dims_array, y_dims_array + axis, 1); if (axis + y_dims.size() < max_dim) { @@ -68,7 +67,6 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims, std::copy(x_dims.Get(), x_dims.Get() + x_dims.size(), x_dims_array + axis); std::copy(y_dims.Get(), y_dims.Get() + y_dims.size(), y_dims_array); } - for (int i = 0; i < max_dim; ++i) { PADDLE_ENFORCE_EQ( x_dims_array[i] == y_dims_array[i] || x_dims_array[i] <= 1 || diff --git a/paddle/phi/kernels/funcs/dropout_impl.cu.h b/paddle/phi/kernels/funcs/dropout_impl.cu.h index 463272a37c00d..855b6fe6c8e15 100644 --- a/paddle/phi/kernels/funcs/dropout_impl.cu.h +++ b/paddle/phi/kernels/funcs/dropout_impl.cu.h @@ -349,19 +349,6 @@ void DropoutFwGPUKernelDriver( } else { bool copy_in_kernel = GetSeedDataAndIncrement( dev_ctx, seed, is_fix_seed, seed_val, offset, &seed_data, &increment); -#ifdef PADDLE_WITH_HIP - VectorizedRandomGenerator - <<>>(0, - size, - seed_data, - dropout_prob, - x_data, - mask_data, - y_data, - upscale_in_train, - increment, - main_offset); -#else const phi::GPUContext* dev_ctx_p = &dev_ctx; auto gen_cuda = dev_ctx.GetGenerator(); auto state_index = gen_cuda->GetStateIndex(); @@ -370,10 +357,11 @@ void DropoutFwGPUKernelDriver( parameterSetter = [offset, dev_ctx_p, state_index, is_fix_seed]( phi::backends::gpu::gpuKernelParams& params) { if (!is_fix_seed) { - // we assume seed is null pointer - // seed copy to cpu is meaningless here + // we assume seed is null pointer + // seed copy to cpu is meaningless here +#ifndef PADDLE_WITH_HIP assert(seed_tensor_ptr == nullptr); - +#endif auto gen_cuda = dev_ctx_p->GetGenerator(); // ensure the generator use correct state index gen_cuda->SetStateIndex(state_index); @@ -393,9 +381,14 @@ void DropoutFwGPUKernelDriver( cudaKernelCallback = [=](unsigned int id) { void* functionPtr = reinterpret_cast(&(VectorizedRandomGenerator)); +#ifdef PADDLE_WITH_HIP + hipFunction_t cudaFunc = + reinterpret_cast(functionPtr); +#else cudaFunction_t cudaFunc; PADDLE_ENFORCE_GPU_SUCCESS( cudaGetFuncBySymbol(&cudaFunc, functionPtr)); +#endif VLOG(10) << "[cudaKernelCallback] cudaFunc = " << cudaFunc << " functionPtr = " << functionPtr; @@ -417,7 +410,6 @@ void DropoutFwGPUKernelDriver( VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data << ", increment = " << increment; -#endif } } else { if (upscale_in_train) { diff --git a/paddle/phi/kernels/funcs/eigen/broadcast.cc b/paddle/phi/kernels/funcs/eigen/broadcast.cc index 04e13a6799931..0bf9d37d60e4a 100644 --- a/paddle/phi/kernels/funcs/eigen/broadcast.cc +++ b/paddle/phi/kernels/funcs/eigen/broadcast.cc @@ -73,7 +73,9 @@ struct EigenBroadcastGrad { template struct FUNCTOR; \ template struct FUNCTOR; \ template struct FUNCTOR; \ - template struct FUNCTOR + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR INSTANTIATION(EigenBroadcast, bool); INSTANTIATION(EigenBroadcast, dtype::float16); INSTANTIATION(EigenBroadcast, dtype::bfloat16); diff --git a/paddle/phi/kernels/funcs/eigen/broadcast.cu b/paddle/phi/kernels/funcs/eigen/broadcast.cu index 0c5a3408872c4..fe16588c9bce6 100644 --- 
a/paddle/phi/kernels/funcs/eigen/broadcast.cu +++ b/paddle/phi/kernels/funcs/eigen/broadcast.cu @@ -72,7 +72,9 @@ struct EigenBroadcastGrad { template struct FUNCTOR; \ template struct FUNCTOR; \ template struct FUNCTOR; \ - template struct FUNCTOR + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR INSTANTIATION(EigenBroadcast, bool); INSTANTIATION(EigenBroadcast, dtype::float16); INSTANTIATION(EigenBroadcast, dtype::bfloat16); diff --git a/paddle/phi/kernels/funcs/jit/README.en.md b/paddle/phi/kernels/funcs/jit/README.en.md index 0e1958a5c1415..cf661d5468a6c 100644 --- a/paddle/phi/kernels/funcs/jit/README.en.md +++ b/paddle/phi/kernels/funcs/jit/README.en.md @@ -100,4 +100,4 @@ Add more implementations of `your_key` for performance enhancement. 1. Add functions based on generated code in `gen`. It should be derived from `JitCode` and should have corresponding creator from `JitCodeCreator` which will be registered on the `your_key`. 2. If new attribute type is added, you should specialize `JitCodeKey` of this type. -3. Add more functions in `more`,you can use any third party you wish, like mkl, mkldnn or intrinsic code to reach the best performance. +3. Add more functions in `more`,you can use any third party you wish, like mkl, onednn or intrinsic code to reach the best performance. diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h index 6a82875819161..3eee52efcbebe 100644 --- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h @@ -166,14 +166,14 @@ __inline__ __device__ double rsqrt_(const double val) { return ::rsqrt(val); } -#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) || defined(PADDLE_WITH_HIP) template <> __inline__ __device__ half rsqrt_(const half val) { return hrsqrt(val); } #endif -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template 1) { if (lane == 0) { @@ -290,7 +294,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fast_ln_fwd_kernel( #pragma unroll for (int it = 1; it < THREADS_PER_WARP; it *= 2) { +#ifdef PADDLE_WITH_HIP + var_local += __shfl_xor(var_local, it); +#else var_local += __shfl_xor_sync(uint32_t(-1), var_local, it); +#endif } if (WARPS_N > 1) { @@ -546,7 +554,7 @@ __inline__ __device__ void cuLoadAddStridedInputs(const int64_t i1_block, } } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template 0; it /= 2) { +#ifdef PADDLE_WITH_HIP + sum_loss1 += __shfl_down(sum_loss1, it); + sum_loss2 += __shfl_down(sum_loss2, it); +#else sum_loss1 += __shfl_down_sync(uint32_t(-1), sum_loss1, it); sum_loss2 += __shfl_down_sync(uint32_t(-1), sum_loss2, it); +#endif } if (lane == 0) { diff --git a/paddle/phi/kernels/fusion/cpu/fused_layer_norm_avx_kernel.cc b/paddle/phi/kernels/fusion/cpu/fused_layer_norm_avx_kernel.cc new file mode 100644 index 0000000000000..62944d7ea3b09 --- /dev/null +++ b/paddle/phi/kernels/fusion/cpu/fused_layer_norm_avx_kernel.cc @@ -0,0 +1,244 @@ +// Copyright (c) 2024 PaddlePaddle Authors All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" + +namespace phi { +namespace fusion { + +template +void ResidualBiasSumFunc(const T* x_data, + const T* residual_data, + const T* bias_data, + const float residual_alpha, + const int rows, + const int cols, + const int iStride, + const int oStride, + T* out_data) { + __m512 vresidual_alpha = _mm512_set1_ps(residual_alpha); + const T* pb = bias_data; +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int r = 0; r < rows; ++r) { + const T* px = x_data + r * iStride; + const T* pr = residual_data ? residual_data + r * iStride : nullptr; + T* py = out_data + r * oStride; + for (int col = 0; col < cols; col += 16) { + int remain = cols - col; + __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + + // residual*alpha + bias + x + __m512 vx = _mm512_maskz_loadu_ps(mask, px + col); + if (residual_data) { + __m512 residual_vx = _mm512_maskz_loadu_ps(mask, pr + col); + residual_vx = _mm512_mul_ps(residual_vx, vresidual_alpha); + vx = _mm512_mask_add_ps(vx, mask, vx, residual_vx); + } + if (bias_data) { + __m512 vb = _mm512_maskz_loadu_ps(mask, pb + col); + vx = _mm512_mask_add_ps(vx, mask, vx, vb); + } + _mm512_mask_storeu_ps(py + col, mask, vx); + } + } +} + +template +void LayerNormFunc(const T* x_data, + const T* residual_data, + const T* bias_data, + const T* norm_weight_data, + const T* norm_bias_data, + const float epsilon, + const float residual_alpha, + const int rows, + const int cols, + const int iStride, + const int oStride, + T* out_data, + T* residual_out_data, + T* mean_out, + T* var_out) { + auto size = cols; + __m512 vresidual_alpha = _mm512_set1_ps(residual_alpha); + __m512 vgamma = _mm512_set1_ps(1); + __m512 vbeta = _mm512_set1_ps(0); + const T* pb = bias_data; +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int r = 0; r < rows; ++r) { + const T* px = x_data + r * iStride; + const T* pr = residual_data ? residual_data + r * iStride : nullptr; + T* pr_out = residual_out_data ? residual_out_data + r * oStride : nullptr; + T* py = out_data + r * oStride; + + T sum = 0; + T squareSum = 0; + + __m512 vsum = _mm512_set1_ps(0); + __m512 vsqare = _mm512_set1_ps(0); + for (int col = 0; col < size; col += 16) { + int remain = size - col; + __mmask16 mask = (remain >= 16 ? 
0xffff : (1 << remain) - 1); + + // SUM(x) + __m512 vx = _mm512_maskz_loadu_ps(mask, px + col); + if (residual_data) { + __m512 residual_vx = _mm512_maskz_loadu_ps(mask, pr + col); + residual_vx = _mm512_mul_ps(residual_vx, vresidual_alpha); + vx = _mm512_mask_add_ps(vx, mask, vx, residual_vx); + if (bias_data) { + __m512 vb = _mm512_maskz_loadu_ps(mask, pb + col); + vx = _mm512_mask_add_ps(vx, mask, vx, vb); + } + _mm512_mask_storeu_ps(pr_out + col, mask, vx); + } + vsum = _mm512_add_ps(vsum, vx); + + // SUM(x*x) + __m512 tmp = _mm512_mul_ps(vx, vx); + vsqare = _mm512_add_ps(vsqare, tmp); + } + + sum = _mm512_reduce_add_ps(vsum); + squareSum = _mm512_reduce_add_ps(vsqare); + + // Mean + T mean = sum / size; + mean_out[r] = mean; + __m512 vmean = _mm512_set1_ps(mean); + + // Variance + T var = 1 / sqrt(squareSum / size - mean * mean + epsilon); + var_out[r] = var; + __m512 vvar = _mm512_set1_ps(var); + + for (int col = 0; col < size; col += 16) { + int remain = size - col; + __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + + __m512 vx = _mm512_maskz_loadu_ps(mask, px + col); + if (residual_data) { + __m512 residual_vx = _mm512_maskz_loadu_ps(mask, pr + col); + residual_vx = _mm512_mul_ps(residual_vx, vresidual_alpha); + vx = _mm512_mask_add_ps(vx, mask, vx, residual_vx); + if (bias_data) { + __m512 vb = _mm512_maskz_loadu_ps(mask, pb + col); + vx = _mm512_mask_add_ps(vx, mask, vx, vb); + } + } + if (norm_weight_data) { + vgamma = _mm512_maskz_loadu_ps(mask, norm_weight_data + col); + } + if (norm_bias_data) { + vbeta = _mm512_maskz_loadu_ps(mask, norm_bias_data + col); + } + // (vx - vmean) * vgamma * vvar + vbeta + vx = _mm512_mask_sub_ps(vx, mask, vx, vmean); + vx = _mm512_mask_mul_ps(vx, mask, vx, vgamma); + vx = _mm512_mask_mul_ps(vx, mask, vx, vvar); + __m512 vy = _mm512_mask_add_ps(vx, mask, vx, vbeta); + _mm512_mask_storeu_ps(py + col, mask, vy); + } + } +} + +template +void FusedLayerNormAvxKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional& bias, + const paddle::optional& residual, + const paddle::optional& norm_weight, + const paddle::optional& norm_bias, + const float epsilon, + const float residual_alpha, + const int begin_norm_axis, + const float quant_scale, + const int quant_round_type, + const float quant_max_bound, + const float quant_min_bound, + DenseTensor* out, + DenseTensor* residual_out, + DenseTensor* mean, + DenseTensor* variance) { + if (quant_scale > 0.0f) { + PD_THROW("NOT supported quant int8. "); + } + const auto x_dims = x.dims(); + auto matrix_dim = common::flatten_to_2d(x_dims, begin_norm_axis); + T* out_data = dev_ctx.template Alloc(out); + T* mean_out = dev_ctx.template Alloc(mean); + T* var_out = dev_ctx.template Alloc(variance); + + const T* x_data = x.data(); + const T* bias_data = bias ? bias.get().data() : nullptr; + const T* residual_data = residual ? residual.get().data() : nullptr; + const T* norm_weight_data = + norm_weight ? norm_weight.get().data() : nullptr; + const T* norm_bias_data = norm_bias ? norm_bias.get().data() : nullptr; + T* residual_out_data = + residual ? 
+ +template <typename T, typename Context> +void FusedLayerNormAvxKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional<DenseTensor>& bias, + const paddle::optional<DenseTensor>& residual, + const paddle::optional<DenseTensor>& norm_weight, + const paddle::optional<DenseTensor>& norm_bias, + const float epsilon, + const float residual_alpha, + const int begin_norm_axis, + const float quant_scale, + const int quant_round_type, + const float quant_max_bound, + const float quant_min_bound, + DenseTensor* out, + DenseTensor* residual_out, + DenseTensor* mean, + DenseTensor* variance) { + if (quant_scale > 0.0f) { + PD_THROW("NOT supported quant int8. "); + } + const auto x_dims = x.dims(); + auto matrix_dim = common::flatten_to_2d(x_dims, begin_norm_axis); + T* out_data = dev_ctx.template Alloc<T>(out); + T* mean_out = dev_ctx.template Alloc<T>(mean); + T* var_out = dev_ctx.template Alloc<T>(variance); + + const T* x_data = x.data<T>(); + const T* bias_data = bias ? bias.get().data<T>() : nullptr; + const T* residual_data = residual ? residual.get().data<T>() : nullptr; + const T* norm_weight_data = + norm_weight ? norm_weight.get().data<T>() : nullptr; + const T* norm_bias_data = norm_bias ? norm_bias.get().data<T>() : nullptr; + T* residual_out_data = + residual ? dev_ctx.template Alloc<T>(residual_out) : nullptr; + + int32_t rows = static_cast<int32_t>(matrix_dim[0]); + int32_t cols = static_cast<int32_t>(matrix_dim[1]); + + auto iStride = cols; + auto oStride = cols; + if (!norm_weight && !norm_bias_data) { + ResidualBiasSumFunc(x_data, + residual_data, + bias_data, + residual_alpha, + rows, + cols, + iStride, + oStride, + out_data); + } else { + LayerNormFunc(x_data, + residual_data, + bias_data, + norm_weight_data, + norm_bias_data, + epsilon, + residual_alpha, + rows, + cols, + iStride, + oStride, + out_data, + residual_out_data, + mean_out, + var_out); + } +} +} // namespace fusion +} // namespace phi + +PD_REGISTER_KERNEL(fused_bias_residual_layernorm, + CPU, + ALL_LAYOUT, + phi::fusion::FusedLayerNormAvxKernel, + float) {}
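Both this kernel and the RMS norm kernel that follows treat the input as a 2-D [rows, cols] matrix: every axis before `begin_norm_axis` is folded into `rows`, the remaining axes into `cols`, and normalization runs over `cols`. A small sketch of that flattening (our own illustration of what `common::flatten_to_2d` is used for here):

#include <cstdint>
#include <vector>

// Fold an N-D shape into [rows, cols] at begin_norm_axis.
void flatten_to_2d(const std::vector<int64_t>& dims, int begin_norm_axis,
                   int64_t* rows, int64_t* cols) {
  *rows = 1;
  *cols = 1;
  for (int i = 0; i < static_cast<int>(dims.size()); ++i) {
    if (i < begin_norm_axis) *rows *= dims[i];
    else *cols *= dims[i];
  }
}
// e.g. dims = {8, 128, 768}, begin_norm_axis = 2  ->  rows = 1024, cols = 768.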
diff --git a/paddle/phi/kernels/fusion/cpu/rms_norm_avx_kernel.cc b/paddle/phi/kernels/fusion/cpu/rms_norm_avx_kernel.cc new file mode 100644 index 0000000000000..a46b91b3c7330 --- /dev/null +++ b/paddle/phi/kernels/fusion/cpu/rms_norm_avx_kernel.cc @@ -0,0 +1,167 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <immintrin.h> +#include <cmath> +#include <cstdint> +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" + +namespace phi { +namespace fusion { + +template <typename T, typename Context> +void RmsNormAvxKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional<DenseTensor>& bias, + const paddle::optional<DenseTensor>& residual, + const DenseTensor& norm_weight, + const paddle::optional<DenseTensor>& norm_bias, + const float epsilon, + const int begin_norm_axis, + const float quant_scale, + const int quant_round_type, + const float quant_max_bound, + const float quant_min_bound, + DenseTensor* out, + DenseTensor* residual_out, + DenseTensor* inv_var) { + if (quant_scale > 0.0f) { + PD_THROW("NOT supported quant int8. "); + } + + const T* x_data = x.data<T>(); + int32_t rows = 1; + int32_t cols = 1; + for (int i = 0; i < begin_norm_axis; i++) { + rows *= x.dims()[i]; + } + for (int i = begin_norm_axis; i < x.dims().size(); i++) { + cols *= x.dims()[i]; + } + + int size = cols; + auto istride = cols; + auto ostride = cols; + const T* norm_weight_data = norm_weight.data<T>(); + const T* norm_bias_data = norm_bias ? norm_bias.get().data<T>() : nullptr; + const T* residual_data = residual ? residual.get().data<T>() : nullptr; + const T* bias_data = bias ? bias.get().data<T>() : nullptr; + T* out_data = dev_ctx.template Alloc<T>(out); + T* residual_out_data = + residual ? dev_ctx.template Alloc<T>(residual_out) : nullptr; + + __m512 vb = _mm512_setzero_ps(); + const T* pb = bias_data; +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int r = 0; r < rows; ++r) { + const T* px = x_data + r * istride; + const T* pr = residual ? residual_data + r * istride : nullptr; + T* pr_out = residual ? residual_out_data + r * ostride : nullptr; + T* py = out_data + r * ostride; + + T squareSum = 0; + + __m512 vsqare = _mm512_set1_ps(0); + + int col = 0; + for (; col + 15 < size; col += 16) { + // SUM(x*x) + __m512 vx = _mm512_loadu_ps(px + col); + if (residual) { + __m512 residual_vx = _mm512_loadu_ps(pr + col); + vx = _mm512_add_ps(vx, residual_vx); + if (bias) { + __m512 vb = _mm512_loadu_ps(pb + col); + vx = _mm512_add_ps(vx, vb); + } + _mm512_storeu_ps(pr_out + col, vx); + } + __m512 tmp = _mm512_mul_ps(vx, vx); + vsqare = _mm512_add_ps(vsqare, tmp); + } + if (col < size) { + __mmask16 mask = (1 << (size - col)) - 1; + __m512 vx = _mm512_maskz_loadu_ps(mask, px + col); + if (residual) { + __m512 residual_vx = _mm512_maskz_loadu_ps(mask, pr + col); + vx = _mm512_mask_add_ps(vx, mask, vx, residual_vx); + if (bias) { + __m512 vb = _mm512_maskz_loadu_ps(mask, pb + col); + vx = _mm512_mask_add_ps(vx, mask, vx, vb); + } + _mm512_mask_storeu_ps(pr_out + col, mask, vx); + } + __m512 tmp = _mm512_mul_ps(vx, vx); + vsqare = _mm512_add_ps(vsqare, tmp); + } + + squareSum = _mm512_reduce_add_ps(vsqare); + + // Variance (stored as the inverse RMS) + T var = 1 / sqrt(squareSum / size + epsilon); + __m512 vvar = _mm512_set1_ps(var); + + for (col = 0; col + 15 < size; col += 16) { + __m512 vx = _mm512_loadu_ps(px + col); + if (residual) { + __m512 residual_vx = _mm512_loadu_ps(pr + col); + vx = _mm512_add_ps(vx, residual_vx); + if (bias) { + __m512 vb = _mm512_loadu_ps(pb + col); + vx = _mm512_add_ps(vx, vb); + } + } + __m512 vw = _mm512_loadu_ps(norm_weight_data + col); + if (norm_bias_data) { + vb = _mm512_loadu_ps(norm_bias_data + col); + } + + // vy = vx * vvar * vw + vb + vx = _mm512_mul_ps(vx, vvar); + vx = _mm512_mul_ps(vx, vw); + __m512 vy = _mm512_add_ps(vx, vb); + _mm512_storeu_ps(py + col, vy); + } + if (col < size) { + __mmask16 mask = (1 << (size - col)) - 1; + __m512 vx = _mm512_maskz_loadu_ps(mask, px + col); + if (residual) { + __m512 residual_vx = _mm512_maskz_loadu_ps(mask, pr + col); + vx = _mm512_mask_add_ps(vx, mask, vx, residual_vx); + if (bias) { + __m512 vb = _mm512_maskz_loadu_ps(mask, pb + col); + vx = _mm512_mask_add_ps(vx, mask, vx, vb); + } + } + __m512 vw = _mm512_maskz_loadu_ps(mask, norm_weight_data + col); + if (norm_bias_data) { + vb = _mm512_maskz_loadu_ps(mask, norm_bias_data + col); + } + // vx * vvar * vw + vb + vx = _mm512_mask_mul_ps(vx, mask, vx, vvar); + vx = _mm512_mask_mul_ps(vx, mask, vx, vw); + __m512 vy = _mm512_mask_add_ps(vx, mask, vx, vb); + _mm512_mask_storeu_ps(py + col, mask, vy); + } + } // end for rows +} +} // namespace fusion +} // namespace phi + +PD_REGISTER_KERNEL( + rms_norm, CPU, ALL_LAYOUT, phi::fusion::RmsNormAvxKernel, float) {}
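The RMS norm kernel uses the same vector layout but skips mean subtraction entirely: each row is scaled by `1 / sqrt(mean(x^2) + epsilon)`, then by the norm weight, plus an optional norm bias. A scalar reference for one row (illustrative only):

#include <cmath>

// out[c] = x[c] * inv_rms * w[c] + b[c],  inv_rms = 1 / sqrt(mean(x^2) + epsilon).
// A single pass over the squares suffices because no mean is needed.
void rms_norm_row(const float* x, const float* w, const float* b,
                  float epsilon, int cols, float* out) {
  float sq = 0.f;
  for (int c = 0; c < cols; ++c) sq += x[c] * x[c];
  float inv_rms = 1.f / std::sqrt(sq / cols + epsilon);
  for (int c = 0; c < cols; ++c) {
    out[c] = x[c] * inv_rms * w[c] + (b ? b[c] : 0.f);
  }
}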
diff --git a/paddle/phi/kernels/fusion/cpu/self_dp_attention_kernel.cc b/paddle/phi/kernels/fusion/cpu/self_dp_attention_kernel.cc index 0d3189187351c..dff41e6d4250c 100644 --- a/paddle/phi/kernels/fusion/cpu/self_dp_attention_kernel.cc +++ b/paddle/phi/kernels/fusion/cpu/self_dp_attention_kernel.cc @@ -257,7 +257,9 @@ void softmax_sum_max(float* AB, __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); __m512 vx = _mm512_maskz_loadu_ps(mask, buf + off); - vx = vexp(vx * vrefac - vmax); + vx = _mm512_mask_mul_ps(vx, mask, vx, vrefac); + vx = _mm512_mask_sub_ps(vx, mask, vx, vmax); + vx = vexp(vx); _mm512_mask_storeu_ps(buf + off, mask, vx); @@ -275,8 +277,7 @@ __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); __m512 vx = _mm512_maskz_loadu_ps(mask, buf + off); - vx = vx * vrsum; - + vx = _mm512_mask_mul_ps(vx, mask, vx, vrsum); _mm512_mask_storeu_ps(buf + off, mask, vx); } } @@ -301,7 +302,10 @@ void update_out_blk(float* output, __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); __m512 vout = _mm512_maskz_loadu_ps(mask, outbuf + off); __m512 vabc = _mm512_maskz_loadu_ps(mask, buf + off); - __m512 vupt = vout * merr * vfac + vabc; + vout = _mm512_mask_mul_ps(vout, mask, vout, merr); + vout = _mm512_mask_mul_ps(vout, mask, vout, vfac); + __m512 vupt = _mm512_set1_ps(0.0f); + vupt = _mm512_mask_add_ps(vupt, mask, vout, vabc); _mm512_mask_storeu_ps(outbuf + off, mask, vupt); } pre_sum[i] = sum[i]; diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_act.py b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_act.py index 2104c676c9b82..9dd7e98a4109b 100644 --- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_act.py +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_act.py @@ -360,6 +360,117 @@ def generate_sm80_16816(cutlass_dtype="cutlass::half_t"): return sm80_code +# here is sm80 tf32. +def generate_sm80_1688(cutlass_dtype="cutlass::tfloat32_t"): + kernel_dict = { + "element_a": cutlass_dtype, + "layout_a": "cutlass::layout::TensorNHWC", + "element_b": cutlass_dtype, + "layout_b": "cutlass::layout::TensorNHWC", + "element_c": cutlass_dtype, + "layout_c": "cutlass::layout::TensorNHWC", + "opcode_class": "cutlass::arch::OpClassTensorOp", + "arch": "cutlass::arch::Sm80", + "swizzling_functor": "cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>", + # alpha is always float! + "element_epilogue": "float", + "math_operator": "cutlass::arch::OpMultiplyAdd", + } + + kernel_dict["stride_support"] = "cutlass::conv::StrideSupport::kStrided" + + # iterate over the supported iterator algorithms + iterator_algorithms = [ + "cutlass::conv::IteratorAlgorithm::kOptimized", + ] + + math_instructions = [ + ( + "16,8,8", + cutlass_dtype, + cutlass_dtype, + "float", + ), + ] + + alignments = [4] + + kernel_dict["align_a"] = "4" + kernel_dict["align_b"] = "4" + # this should evenly divide oc + kernel_dict["epilogue_vector_length"] = "4" + kernel_dict["split_k_slices"] = "1" + + sm80_code = "" + for epi_func in SupportedAct: + op_dict = {} + op_dict["func_name"] = UnderScoreName[epi_func].lower() + "_sm80_fp32" + op_dict["enum_op_name"] = UnderScoreName[epi_func].upper() + # For a function, we record all its kernels into a std::vector in C++ code + all_kernel_names = "" + all_kernel_declares = "" + kernel_dict["epi_func"] = ActTag[epi_func] + suffix = 0 + for iterator_algorithm in iterator_algorithms: + for alignment in alignments: + for math_inst in math_instructions: + tiles = [ + TileDesc("128, 128, 16", 4, "32, 64, 16", math_inst), + TileDesc("128, 128, 16", 3, "32, 64, 16", math_inst), + TileDesc("256, 64, 16", 3, "64, 32, 16", math_inst), + TileDesc("64, 256, 16", 3, "32, 64, 16", math_inst), + TileDesc("128, 64, 16", 4, "64, 32, 16", math_inst), + TileDesc("64, 128, 16", 4, "32, 64, 16", math_inst), + TileDesc("64, 64, 16", 3, "32, 32, 16", math_inst), + TileDesc("128, 128, 32", 3, "32, 64, 32", math_inst), + TileDesc("256, 64, 32", 3, "64, 32, 32", math_inst), + TileDesc("64, 256, 32", 3, "32, 64, 32", math_inst), + TileDesc("128, 64, 32", 3, "64, 32, 32", math_inst), + TileDesc("64, 128, 32", 3, "32, 64, 32", math_inst), + TileDesc("64, 64, 32", 3, "32, 32, 32", math_inst), + ] + for tile in tiles: + kernel_dict["iterator_algorithm"] = iterator_algorithm +
kernel_dict["Tshape"] = tile.Tshape + kernel_dict["Wshape"] = tile.Wshape + kernel_dict["Ishape"] = tile.math_inst[0] + kernel_dict["stages"] = str(tile.stages) + kernel_dict["element_accum"] = tile.math_inst[3] + kernel_dict["kernel_func_name"] = op_dict[ + "func_name" + ] + str(suffix) + suffix += 1 + cba_kernel = cba_kernel_no_alpha + if epi_func in [CbaAct.LeakyRelu]: + cba_kernel = cba_kernel_alpha + kernel_str = ( + cba_header + + SubstituteTemplate(cba_kernel, kernel_dict) + + CommonTail + ) + file_name = ( + "generated_tmp/" + + kernel_dict["kernel_func_name"] + + ".cu" + ) + write_kernel_to_file(kernel_str, file_name) + all_kernel_names += ( + kernel_dict["kernel_func_name"] + ", \n" + ) + all_kernel_declares += ( + "cutlass::Status " + + kernel_dict["kernel_func_name"] + + "(const ConvAllParams& params);" + ) + + # Generate op code + op_dict["kernel_func_declare"] = all_kernel_declares + op_dict["all_kernel_func_name"] = all_kernel_names + sm80_code += SubstituteTemplate(CommonConvFunction, op_dict) + + return sm80_code + + if __name__ == "__main__": sm_versions_and_types = [] args = parse_args() @@ -371,8 +482,10 @@ def generate_sm80_16816(cutlass_dtype="cutlass::half_t"): if args.cuda_arch in ["80", "86", "89"]: sm_versions_and_types.append(["80", "fp16"]) sm_versions_and_types.append(["80", "bf16"]) + sm_versions_and_types.append(["80", "fp32"]) all_code += generate_sm80_16816() all_code += generate_sm80_16816(cutlass_dtype="cutlass::bfloat16_t") + all_code += generate_sm80_1688(cutlass_dtype="cutlass::tfloat32_t") all_code += GenerateFunctionForPhi( sm_versions_and_types, SupportedAct, UnderScoreName, CamelName diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_residual.py b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_residual.py index 629ffc12415e9..e243a64e1548d 100644 --- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_residual.py +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_residual.py @@ -350,6 +350,121 @@ def generate_sm80_16816(cutlass_dtype="cutlass::half_t"): return sm80_code +def generate_sm80_1688(cutlass_dtype="cutlass::tfloat32_t"): + kernel_dict = { + "conv_kind_name": "Fprop", + "element_a": cutlass_dtype, + "layout_a": "cutlass::layout::TensorNHWC", + "element_b": cutlass_dtype, + "layout_b": "cutlass::layout::TensorNHWC", + "element_c": cutlass_dtype, + "layout_c": "cutlass::layout::TensorNHWC", + "opcode_class": "cutlass::arch::OpClassTensorOp", + "arch": "cutlass::arch::Sm80", + "swizzling_functor": "cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>", + # alpha is always float! 
+ "element_epilogue": "float", + "math_operator": "cutlass::arch::OpMultiplyAdd", + "element_residul": cutlass_dtype, + } + + kernel_dict["stride_support"] = "cutlass::conv::StrideSupport::kStrided" + + # iterate over this loop + iterator_algorithms = [ + "cutlass::conv::IteratorAlgorithm::kOptimized", + ] + + math_instructions = [ + ( + "16,8,8", + cutlass_dtype, + cutlass_dtype, + "float", + ), + ] + + alignments = [4] + + kernel_dict["align_a"] = "4" + kernel_dict["align_b"] = "4" + kernel_dict["epilogue_vector_length"] = "4" + kernel_dict["split_k_slices"] = "1" + + sm80_code = "" + for epi_res_block in SupportedEpilogue: + op_dict = {} + op_dict["func_name"] = ( + UnderScoreName[epi_res_block].lower() + "_sm80_fp32" + ) + op_dict["enum_op_name"] = UnderScoreName[epi_res_block].upper() + # for a op, we record all its kernels into a std::vector in C++ code + all_kernel_names = "" + all_kernel_declares = "" + suffix = 0 + for iterator_algorithm in iterator_algorithms: + for alignment in alignments: + for math_inst in math_instructions: + tiles = [ + TileDesc("128, 128, 16", 4, "32, 64, 16", math_inst), + TileDesc("128, 128, 16", 3, "32, 64, 16", math_inst), + TileDesc("256, 64, 16", 3, "64, 32, 16", math_inst), + TileDesc("64, 256, 16", 3, "32, 64, 16", math_inst), + TileDesc("128, 64, 16", 4, "64, 32, 16", math_inst), + TileDesc("64, 128, 16", 4, "32, 64, 16", math_inst), + TileDesc("64, 64, 16", 3, "32, 32, 16", math_inst), + TileDesc("128, 128, 32", 3, "32, 64, 32", math_inst), + TileDesc("256, 64, 32", 3, "64, 32, 32", math_inst), + TileDesc("64, 256, 32", 3, "32, 64, 32", math_inst), + TileDesc("128, 64, 32", 3, "64, 32, 32", math_inst), + TileDesc("64, 128, 32", 3, "32, 64, 32", math_inst), + TileDesc("64, 64, 32", 3, "32, 32, 32", math_inst), + ] + + for tile in tiles: + kernel_dict["iterator_algorithm"] = iterator_algorithm + kernel_dict["Tshape"] = tile.Tshape + kernel_dict["Wshape"] = tile.Wshape + kernel_dict["Ishape"] = tile.math_inst[0] + kernel_dict["stages"] = str(tile.stages) + kernel_dict["element_accum"] = tile.math_inst[3] + kernel_dict["kernel_func_name"] = op_dict[ + "func_name" + ] + str(suffix) + suffix += 1 + kernel_dict["act1"] = ActTag[epi_res_block[0]] + kernel_dict["binary"] = epi_res_block[1] + kernel_dict["act2"] = ActTag[epi_res_block[2]] + + # sm80_code += SubstituteTemplate(cbr_kernel, kernel_dict) + kernel_str = ( + cbr_header + + SubstituteTemplate(cbr_kernel, kernel_dict) + + CommonTail + ) + file_name = ( + "generated_tmp/" + + kernel_dict["kernel_func_name"] + + ".cu" + ) + write_kernel_to_file(kernel_str, file_name) + + all_kernel_names += ( + kernel_dict["kernel_func_name"] + ", \n" + ) + all_kernel_declares += ( + "cutlass::Status " + + kernel_dict["kernel_func_name"] + + "(const ConvAllParams& params);" + ) + + # Generate op code + op_dict["kernel_func_declare"] = all_kernel_declares + op_dict["all_kernel_func_name"] = all_kernel_names + sm80_code += SubstituteTemplate(CommonConvFunction, op_dict) + return sm80_code + + if __name__ == "__main__": sm_versions_and_types = [] args = parse_args() @@ -361,8 +476,10 @@ def generate_sm80_16816(cutlass_dtype="cutlass::half_t"): if args.cuda_arch in ["80", "86", "89"]: sm_versions_and_types.append(["80", "fp16"]) sm_versions_and_types.append(["80", "bf16"]) + sm_versions_and_types.append(["80", "fp32"]) all_code += generate_sm80_16816() all_code += generate_sm80_16816(cutlass_dtype="cutlass::bfloat16_t") + all_code += generate_sm80_1688(cutlass_dtype="cutlass::tfloat32_t") all_code += 
GenerateFunctionForPhi( sm_versions_and_types, SupportedEpilogue, UnderScoreName, CamelName diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_common.py b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_common.py index 6dbf6bcbbb82a..5d2425fe4059b 100644 --- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_common.py +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_common.py @@ -57,7 +57,7 @@ ${element_c} *bias = (${element_c} *)(params.bias); ${element_c} *output = (${element_c} *)(params.output); // only used by conv2d_bias_residual - auto residual = (${element_c} *)(params.residual); + auto residual = (${element_c} *)(params.residual); int batch = params.batch; int ic = params.ic; @@ -96,8 +96,8 @@ ImplicitGemm implicit_gemm_op; size_t bytes = implicit_gemm_op.get_workspace_size(arguments); -auto stream = params.stream; -void *workspace = params.workspace; + auto stream = params.stream; + void *workspace = params.workspace; cutlass::Status status = implicit_gemm_op.can_implement(arguments); CUTLASS_CHECK(status); @@ -125,7 +125,7 @@ std::map<std::vector<int>, int> map_problem_${func_name}; std::mutex ${func_name}_mutex; -void ${func_name}(ConvAllParams params) { +bool ${func_name}(ConvAllParams params) { int batch = params.batch; int ic = params.ic; int ih = params.ih; @@ -145,7 +145,7 @@ if (map_problem_${func_name}.count(problem_size)) { ${func_name}_all_func[map_problem_${func_name}.at(problem_size)]( params); - return; + return true; } int best_config_index = ProfileToGetBestConfig( @@ -155,6 +155,7 @@ map_problem_${func_name}[problem_size] = best_config_index; ${func_name}_all_func[best_config_index](params); + return true; } """ @@ -164,8 +165,8 @@ # this function is invoked by phi kernel CommonWrapperForPhi = """ -void ${op_name}(ConvAllParams params) { - ${dispatch_body} +bool ${op_name}(ConvAllParams params) { + ${dispatch_body} } """ @@ -173,14 +174,18 @@ def convert_c_data_type(dtype): if dtype == "fp16": return "Conv2dDataType::fp16" - if dtype == "bf16": + elif dtype == "bf16": return "Conv2dDataType::bf16" + elif dtype == "fp32": + return "Conv2dDataType::fp32" + else: + return None CommonDispatchTemp = ''' if (params.sm_version == ${sm_code} && params.data_type == ${data_type}) { - ${op_name_with_sm}(params); + return ${op_name_with_sm}(params); } ''' @@ -213,6 +218,7 @@ def GenerateFunctionForPhi( + data_type ) dispatch_body += SubstituteTemplate(CommonDispatchTemp, sm_dicts) + dispatch_body += ''' return false;''' op_dicts = {} op_dicts["dispatch_body"] = dispatch_body op_dicts["op_name"] = camel_names[epi_func]
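With these template changes, every generated entry point reports through its `bool` return whether any specialization matched `sm_version`/`data_type`, and `GenerateFunctionForPhi` appends a final `return false;`. Roughly, the emitted C++ now has this shape (a reconstruction for illustration only; the concrete names below are hypothetical and filled in by SubstituteTemplate in reality):

// Illustrative reconstruction of one generated dispatcher after this change.
enum class Conv2dDataType { fp16, bf16, fp32 };
struct ConvAllParams { int sm_version; Conv2dDataType data_type; /* ... */ };

bool conv2d_bias_relu_sm75_fp16(ConvAllParams params);  // hypothetical name
bool conv2d_bias_relu_sm80_fp32(ConvAllParams params);  // hypothetical name

bool Conv2dBiasRelu(ConvAllParams params) {
  if (params.sm_version == 75 && params.data_type == Conv2dDataType::fp16) {
    return conv2d_bias_relu_sm75_fp16(params);
  }
  if (params.sm_version == 80 && params.data_type == Conv2dDataType::fp32) {
    return conv2d_bias_relu_sm80_fp32(params);
  }
  return false;  // appended by the new `return false;` in dispatch_body
}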
-extern "C" void Conv2dBiasAddRelu(ConvAllParams params); -extern "C" void Conv2dBiasRelu(ConvAllParams params); -extern "C" void Conv2dBiasLeakyRelu(ConvAllParams params); -extern "C" void Conv2dBiasSilu(ConvAllParams params); -extern "C" void Conv2dBias(ConvAllParams params); -extern "C" void Conv2dBiasSigmoid(ConvAllParams params); +extern "C" bool Conv2dBiasAddRelu(ConvAllParams params); +extern "C" bool Conv2dBiasRelu(ConvAllParams params); +extern "C" bool Conv2dBiasLeakyRelu(ConvAllParams params); +extern "C" bool Conv2dBiasSilu(ConvAllParams params); +extern "C" bool Conv2dBias(ConvAllParams params); +extern "C" bool Conv2dBiasSigmoid(ConvAllParams params); -extern "C" void Conv2dDepthwiseBias(ConvAllParams params); -extern "C" void Conv2dDepthwiseBiasRelu(ConvAllParams params); -extern "C" void Conv2dDepthwiseBiasSigmoid(ConvAllParams params); -extern "C" void Conv2dDepthwiseBiasSilu(ConvAllParams params); - -extern "C" int HelloFromCutlassConv2d(int a, int b); +extern "C" bool Conv2dDepthwiseBias(ConvAllParams params); +extern "C" bool Conv2dDepthwiseBiasRelu(ConvAllParams params); +extern "C" bool Conv2dDepthwiseBiasSigmoid(ConvAllParams params); +extern "C" bool Conv2dDepthwiseBiasSilu(ConvAllParams params); } // namespace cutlass_internal } // namespace fusion diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu index 0a08cd165519d..6aed60cf1c23b 100644 --- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu @@ -325,6 +325,14 @@ int ProfileToGetBestConfig( params, op_type, static_cast(1.0)) << " compared with baseline," << "cost_time: " << elapsed_time << "ms." << std::endl; + } else if (params.data_type == Conv2dDataType::fp32) { + // debug code + std::cout << OpType2String(op_type) << ": tactic " << i + << " has max diff " + << conv2d_diff_gpu( + params, op_type, static_cast(1.0)) + << " compared with baseline," + << "cost_time: " << elapsed_time << "ms." 
diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/generic_mixed_gemm_kernelLauncher.py b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/generic_mixed_gemm_kernelLauncher.py index 5847956020ceb..17911e4898220 100644 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/generic_mixed_gemm_kernelLauncher.py +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/generic_mixed_gemm_kernelLauncher.py @@ -234,9 +234,7 @@ def generate_source_cu( for arch in archs: for epilogue_tag in EpilogueTags.keys(): for stages in StagesList[arch]: - file_name = "autogen_tmp/generic_mixed_gemm_kernelLauncher_{}_sm{}_stages{}_{}.cu".format( - element_type, arch, stages, epilogue_tag - ) + file_name = f"autogen_tmp/generic_mixed_gemm_kernelLauncher_{element_type}_sm{arch}_stages{stages}_{epilogue_tag}.cu" all_code = generate_source_cu( element_type, arch, diff --git a/paddle/phi/kernels/fusion/cutlass/fused_conv2d_add_act_kernel.cu b/paddle/phi/kernels/fusion/cutlass/fused_conv2d_add_act_kernel.cu index 79057bee76219..9bf22b8ff84a1 100644 --- a/paddle/phi/kernels/fusion/cutlass/fused_conv2d_add_act_kernel.cu +++ b/paddle/phi/kernels/fusion/cutlass/fused_conv2d_add_act_kernel.cu @@ -26,7 +26,7 @@ namespace phi { namespace fusion { namespace cutlass_internal { -typedef void (*func)(phi::fusion::cutlass_internal::ConvAllParams); +typedef bool (*func)(phi::fusion::cutlass_internal::ConvAllParams); template <typename T, typename Context> void FusedConv2dAddActKernel(const Context& ctx, @@ -230,7 +230,12 @@ void FusedConv2dAddActKernel(const Context& ctx, "Cutlass conv2d_depthwise does not support this activation: %s.", activation.c_str())); } - conv_func(params); + + if (!conv_func(params)) { + PADDLE_THROW( + phi::errors::Fatal("no fused_conv2d_add_act cutlass kernel ")); + } + output->set_layout(DataLayout::NHWC); return; } @@ -265,7 +270,11 @@ void FusedConv2dAddActKernel(const Context& ctx, PADDLE_THROW(phi::errors::InvalidArgument( "Cutlass does not support this activation: %s.", activation.c_str())); } - conv_func(params); + + if (!conv_func(params)) { + PADDLE_THROW(phi::errors::Fatal("no fused_conv2d_add_act cutlass kernel ")); + } + output->set_layout(DataLayout::NHWC); } } // namespace cutlass_internal
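On the phi side the contract is now explicit: a `false` return means no CUTLASS kernel matched, and the caller fails loudly instead of silently skipping the convolution. Schematically (a simplified sketch; `ConvAllParams` is treated as opaque and `PADDLE_THROW` replaced by a plain exception):

#include <stdexcept>

struct ConvAllParams;  // opaque here; the real struct carries all conv arguments
typedef bool (*func)(ConvAllParams*);

void RunFusedConv(func conv_func, ConvAllParams* params) {
  // Previously the void-returning dispatcher could fall through silently;
  // now an unmatched sm_version/data_type combination is a hard error.
  if (!conv_func(params)) {
    throw std::runtime_error("no fused_conv2d_add_act cutlass kernel");
  }
}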
diff --git a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu index 60a82cfe7c198..48819c12a8dc0 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu @@ -11,7 +11,12 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#ifndef PADDLE_WITH_HIP +#ifdef PADDLE_WITH_HIP +#include <hip/hip_runtime.h> +#include <hip/hip_fp16.h> +#include <hipcub/hipcub.hpp> +namespace cub = hipcub; +#else #include <cuda_fp16.h> #include <cub/cub.cuh> #endif @@ -21,9 +26,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h" -#ifndef PADDLE_WITH_HIP #include "paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h" -#endif namespace phi { namespace fusion { @@ -51,7 +54,6 @@ void FusedBiasDropoutResidualLnGradKernel( DenseTensor* bias_grad, DenseTensor* ln_scale_grad, DenseTensor* ln_bias_grad) { -#ifndef PADDLE_WITH_HIP using U = LayerNormParamType<T>; auto* d_y_data = y_grad.data<T>(); auto* ln_scale_data = @@ -114,15 +116,19 @@ d_x_data, d_bias_data, d_residual_data); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "FusedBiasDropoutResidualLnGradKernel not surpport for rocm")); -#endif } } // namespace fusion } // namespace phi +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(fused_bias_dropout_residual_layer_norm_grad, + GPU, + ALL_LAYOUT, + phi::fusion::FusedBiasDropoutResidualLnGradKernel, + float, + phi::dtype::float16) {} +#else PD_REGISTER_KERNEL(fused_bias_dropout_residual_layer_norm_grad, GPU, ALL_LAYOUT, @@ -130,3 +136,4 @@ PD_REGISTER_KERNEL(fused_bias_dropout_residual_layer_norm_grad, float, double, phi::dtype::float16) {} +#endif
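The ROCm port keeps one kernel body for both backends: HIP builds alias `hipcub` to `cub`, so the CUB-based reductions compile unchanged, and only the registration blocks differ (the HIP build drops the `double` instantiation). A trimmed sketch of the aliasing trick (illustrative; `BlockSum` is our own example, not patch code):

#ifdef PADDLE_WITH_HIP
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;  // cub:: now resolves to hipcub under HIP
#else
#include <cub/cub.cuh>
#endif

template <typename T, int BlockSize>
__device__ T BlockSum(T val) {
  // Identical source for CUDA and ROCm thanks to the namespace alias.
  typedef cub::BlockReduce<T, BlockSize> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp;
  return BlockReduce(temp).Sum(val);
}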
diff --git a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu index 37450d3a4e178..ca0bcbe7f2466 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu @@ -17,9 +17,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h" -#ifndef PADDLE_WITH_HIP #include "paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h" -#endif namespace phi { namespace fusion { @@ -42,7 +40,6 @@ void FusedBiasDropoutResidualLnKernel( DenseTensor* dropout_mask_out, DenseTensor* ln_mean, DenseTensor* ln_variance) { -#ifndef PADDLE_WITH_HIP using U = phi::funcs::LayerNormParamType<T>; auto* x_data = x.data<T>(); auto* bias_data = (bias.get_ptr() == nullptr) ? nullptr : bias->data<T>(); @@ -95,14 +92,20 @@ y_data, ln_mean_data, ln_var_data); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "FusedBiasDropoutResidualLnKernel not support for rocm")); -#endif } } // namespace fusion } // namespace phi +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(fused_bias_dropout_residual_layer_norm, + GPU, + ALL_LAYOUT, + phi::fusion::FusedBiasDropoutResidualLnKernel, + float, + phi::dtype::float16) { + kernel->OutputAt(1).SetDataType(phi::DataType::UINT8); +} +#else PD_REGISTER_KERNEL(fused_bias_dropout_residual_layer_norm, GPU, ALL_LAYOUT, @@ -112,3 +115,4 @@ PD_REGISTER_KERNEL(fused_bias_dropout_residual_layer_norm, float, phi::dtype::float16) { kernel->OutputAt(1).SetDataType(phi::DataType::UINT8); } +#endif diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h index e5f5c9ba50ba4..d2cd2f1b545a7 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h @@ -35,7 +35,11 @@ struct GeluFunctor { template <typename T> struct FastGeluFunctor { inline __device__ T operator()(const T x) const { +#ifdef PADDLE_WITH_HIP + assert(0 && "ROCM does not support FastGelu"); +#else return phi::GeluFwd<T, true>(x); +#endif } }; @@ -92,8 +96,8 @@ __global__ void FusedDropoutActBias( int row_id = blockIdx.y; int idx = row_id * cols + col_id; - curandStatePhilox4_32_10_t state; - curand_init(seed, idx, increment, &state); + GPURAND(StatePhilox4_32_10_t) state; + GPURAND(_init)(seed, idx, increment, &state); const T factor = phi::fusion::GetFactor<T>(dropout_prob, is_upscale_in_train, is_test);
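`GPURAND` is the token-pasting macro introduced in the fused_dropout_common.h hunk further below: `GPURAND(StatePhilox4_32_10_t)` expands to `curandStatePhilox4_32_10_t` or `hiprandStatePhilox4_32_10_t`, and `GPURAND(_init)` to `curand_init`/`hiprand_init`, so one kernel body drives both RNG APIs. A self-contained sketch (illustrative; the HIP include path may vary by ROCm version):

#ifdef PADDLE_WITH_HIP
#include <hiprand_kernel.h>
#define GPURAND(str) hiprand##str
#else
#include <curand_kernel.h>
#define GPURAND(str) curand##str
#endif

__device__ float UniformSample(unsigned long long seed, int idx,
                               unsigned long long offset) {
  // Expands to curandStatePhilox4_32_10_t / hiprandStatePhilox4_32_10_t.
  GPURAND(StatePhilox4_32_10_t) state;
  GPURAND(_init)(seed, idx, offset, &state);
  return GPURAND(_uniform)(&state);  // uniform float in (0, 1] on both backends
}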
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu index 801f070251fb2..8994d52138233 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu @@ -202,18 +202,6 @@ void FusedDropoutAddGradKernel(const Context& dev_ctx, ? NoMaskBwFunctor<T, float>(1.0f - dropout_rate) : NoMaskBwFunctor<T, float>(1.0f - dropout_rate, 1.0f); -#ifdef PADDLE_WITH_HIP - VectorizedDropoutBackward<T, NoMaskBwFunctor<T, float>> - <<>>(0, - numel, - seed_data, // idx: 2 need save - x_grad_data, - y_grad_data, - out_grad_data, - increment, // idx: 6 need save - main_offset, - functor); -#else // we assume seed/offset is same across iterations // seed_offset_data should preserved by cudaGraph pool const phi::GPUContext* dev_ctx_p = &dev_ctx; @@ -233,9 +221,13 @@ cudaKernelCallback = [=](unsigned int id) { void* functionPtr = reinterpret_cast<void*>( &(VectorizedDropoutBackward<T, NoMaskBwFunctor<T, float>>)); +#ifdef PADDLE_WITH_HIP + hipFunction_t cudaFunc = reinterpret_cast<hipFunction_t>(functionPtr); +#else cudaFunction_t cudaFunc; PADDLE_ENFORCE_GPU_SUCCESS( cudaGetFuncBySymbol(&cudaFunc, functionPtr)); +#endif VLOG(10) << "[cudaKernelCallback] cudaFunc = " << cudaFunc << " functionPtr = " << functionPtr; @@ -257,7 +249,6 @@ VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data << ", increment = " << increment; -#endif } } diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu index c95c5fbf0ca3d..54ec3604bbee9 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu @@ -186,18 +186,6 @@ void FusedDropoutAddKernel(const Context& dev_ctx, auto dst_functor = NoMaskFwFunctor<T, float>(1.0f - dropout_rate, upscale_in_train); -#ifdef PADDLE_WITH_HIP - VectorizedDropoutForward<T, NoMaskFwFunctor<T, float>> - <<>>(0, - numel, - seed_data, // need save - x_data, - y_data, - out_data, - increment, // need save - main_offset, - dst_functor); -#else // we assume seed/offset is same across iterations // seed_offset_data should preserved by cudaGraph pool const phi::GPUContext* dev_ctx_p = &dev_ctx; @@ -237,9 +225,13 @@ cudaKernelCallback = [=](unsigned int id) { void* functionPtr = reinterpret_cast<void*>( &(VectorizedDropoutForward<T, NoMaskFwFunctor<T, float>>)); +#ifdef PADDLE_WITH_HIP + hipFunction_t cudaFunc = reinterpret_cast<hipFunction_t>(functionPtr); +#else cudaFunction_t cudaFunc; PADDLE_ENFORCE_GPU_SUCCESS( cudaGetFuncBySymbol(&cudaFunc, functionPtr)); +#endif VLOG(10) << "[cudaKernelCallback] cudaFunc = " << cudaFunc << " functionPtr = " << functionPtr; @@ -260,7 +252,6 @@ VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data << ", increment = " << increment; -#endif } else { using MT = typename phi::dtype::MPTypeTrait<T>::Type; MT factor = static_cast<MT>(1.0f - dropout_rate);
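Both dropout-add kernels need a kernel handle for the CUDA-graph capture path. CUDA obtains a `cudaFunction_t` through `cudaGetFuncBySymbol`, while the HIP branch simply reinterprets the host function pointer as a `hipFunction_t`. Schematically (our own stand-in kernel; the patch wraps the CUDA call in `PADDLE_ENFORCE_GPU_SUCCESS`):

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif

__global__ void SomeKernel(int n) {}  // illustrative stand-in

bool LookupKernelHandle() {
  void* functionPtr = reinterpret_cast<void*>(&SomeKernel);
#ifdef PADDLE_WITH_HIP
  // HIP path: no symbol lookup; the host stub is cast directly.
  hipFunction_t gpuFunc = reinterpret_cast<hipFunction_t>(functionPtr);
  return gpuFunc != nullptr;
#else
  cudaFunction_t gpuFunc;
  return cudaGetFuncBySymbol(&gpuFunc, functionPtr) == cudaSuccess;
#endif
}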
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_common.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_common.h index 2ef46378b1b9b..ef9ecbb435fdb 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_dropout_common.h +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_common.h @@ -20,10 +20,25 @@ limitations under the License. */ #include <cooperative_groups.h> #endif +#ifdef PADDLE_WITH_HIP +#include <hip/hip_runtime.h> +#include <hip/hip_fp16.h> +#include <hiprand.h> +#include <hiprand_kernel.h> +#endif + #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h" +#ifdef PADDLE_WITH_HIP +#define GPU(str) hip##str +#define GPURAND(str) hiprand##str +#else +#define GPU(str) cuda##str +#define GPURAND(str) curand##str +#endif + namespace phi { namespace fusion { @@ -63,26 +78,29 @@ inline phi::backends::gpu::GpuLaunchConfig Get1DBlocksAnd2DGrids( } template <int VecSize> -__forceinline__ __device__ void RandVec(curandStatePhilox4_32_10_t *state, +__forceinline__ __device__ void RandVec(GPURAND(StatePhilox4_32_10_t) * state, float *data); template <> -__forceinline__ __device__ void RandVec<1>(curandStatePhilox4_32_10_t *state, +__forceinline__ __device__ void RandVec<1>(GPURAND(StatePhilox4_32_10_t) * + state, float *data) { - data[0] = curand_uniform(state); + data[0] = GPURAND(_uniform)(state); } template <> -__forceinline__ __device__ void RandVec<2>(curandStatePhilox4_32_10_t *state, +__forceinline__ __device__ void RandVec<2>(GPURAND(StatePhilox4_32_10_t) * + state, float *data) { - data[0] = curand_uniform(state); - data[1] = curand_uniform(state); + data[0] = GPURAND(_uniform)(state); + data[1] = GPURAND(_uniform)(state); } template <> -__forceinline__ __device__ void RandVec<4>(curandStatePhilox4_32_10_t *state, +__forceinline__ __device__ void RandVec<4>(GPURAND(StatePhilox4_32_10_t) * + state, float *data) { - float4 rand4 = curand_uniform4(state); + float4 rand4 = GPURAND(_uniform4)(state); data[0] = rand4.x; data[1] = rand4.y; data[2] = rand4.w; @@ -90,7 +108,8 @@ __forceinline__ __device__ void RandVec<4>(curandStatePhilox4_32_10_t *state, } template <> -__forceinline__ __device__ void RandVec<8>(curandStatePhilox4_32_10_t *state, +__forceinline__ __device__ void RandVec<8>(GPURAND(StatePhilox4_32_10_t) * + state, float *data) { RandVec<4>(state, data); RandVec<4>(state, data + 4); @@ -99,7 +118,7 @@ __forceinline__ __device__ void RandVec<8>(curandStatePhilox4_32_10_t *state, template <typename T> inline void SetZero(const phi::GPUContext &ctx, T *ptr, const size_t size) { PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemsetAsync(ptr, 0, size * sizeof(T), ctx.stream())); + GPU(MemsetAsync)(ptr, 0, size * sizeof(T), ctx.stream())); } /** diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu index e31b24e7e105e..221019531a548 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu @@ -38,10 +38,19 @@ limitations under the License. #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" -#ifndef PADDLE_WITH_HIP -#include <cub/cub.cuh> #include "paddle/phi/kernels/fusion/gpu/attention_layer.norm.h" #include "paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h" +#ifdef PADDLE_WITH_HIP +#include <hip/hip_runtime.h> +#include <hip/hip_fp16.h> +#include <hipcub/hipcub.hpp> +namespace cub = hipcub; +#define GPU(str) hip##str +#define GPUMultiProcessorCount hipDeviceAttributeMultiprocessorCount +#else +#include <cub/cub.cuh> +#define GPU(str) cuda##str +#define GPUMultiProcessorCount cudaDevAttrMultiProcessorCount #endif namespace phi { @@ -50,9 +59,11 @@ namespace fusion { namespace { -#ifndef PADDLE_WITH_HIP - +#ifdef PADDLE_WITH_HIP +constexpr int kWarpSize = 64; +#else constexpr int kWarpSize = 32; +#endif template <typename T> struct SumOp { @@ -74,7 +85,11 @@ template